# 第四章 逻辑回归模型 - 股票客户流失预警模型

# 1.案例实战 - 股票客户流失预警模型

In [1]:
# 1. Load the data
import pandas as pd
df = pd.read_excel('股票客户流失.xlsx')  # relative path: the workbook is looked up in the current working directory
df.head()  # preview the first rows; the last column (是否流失) is the churn label used as the target below

Unnamed: 0,账户资金（元）,最后一次交易距今时间（天）,上月交易佣金（元）,累计交易佣金（元）,本券商使用时长（年）,是否流失
0,22686.5,297,149.25,2029.85,0,0
1,190055.0,42,284.75,3889.5,2,0
2,29733.5,233,269.25,2108.15,0,1
3,185667.5,44,211.5,3840.75,3,0
4,33648.5,213,353.5,2151.65,0,1


In [2]:
# 2. Separate the target from the features
y = df['是否流失']  # target: the churn flag column
X = df.drop(labels='是否流失', axis=1)  # features: every remaining column

In [3]:
# 3. Split into a training set and a test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)  # test_size=0.2 holds out 20% of the rows; fixing random_state makes the split identical on every run

X_train.head()  # show the first 5 rows of X_train; outside a notebook, wrap this in print() to see it
# y_train.head()  # first 5 rows of the training labels y_train (use print() outside a notebook)
# X_test.head()  # first 5 rows of the test features X_test (use print() outside a notebook)
# y_test.head()  # first 5 rows of the test labels y_test (use print() outside a notebook)

Unnamed: 0,账户资金（元）,最后一次交易距今时间（天）,上月交易佣金（元）,累计交易佣金（元）,本券商使用时长（年）
1814,43251.5,192,98.5,2258.35,0
5946,304449.5,22,369.5,5160.55,3
3881,441357.5,9,325.75,6681.75,5
2389,587076.5,2,427.25,8300.85,5
3676,204027.5,39,352.0,4044.75,2


In [4]:
# 4. Build and train the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()  # every hyperparameter is left at the library default
# NOTE(review): the features are passed in unscaled although their magnitudes differ
# widely (account funds vs. years with the broker). Presumably worth checking for a
# ConvergenceWarning and considering standardization - confirm before interpreting
# the fitted coefficients.
model.fit(X_train, y_train)  # fit on the training split only; the test split stays unseen

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [5]:
# 5. Using the model (1): predict the class label of every test sample
y_pred = model.predict(X_test)
print(y_pred[:100])  # peek at the first 100 predicted labels

[0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1]


In [6]:
# Put predictions and ground truth side by side in one DataFrame for comparison.
# list() discards the shuffled index carried by y_test, so both columns line up by position.
a = pd.DataFrame({'预测值': list(y_pred), '实际值': list(y_test)})
a.head()  # 4 of the first 5 predictions match the actual label, i.e. 80% on these rows

Unnamed: 0,预测值,实际值
0,0,0
1,0,0
2,0,0
3,0,1
4,0,0


In [7]:
# Accuracy over the whole test set
from sklearn.metrics import accuracy_score
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the ground truth
# first. Accuracy itself is symmetric, so the printed value does not change, but keeping
# the documented order stops this line from giving wrong results when it is copied for
# an asymmetric metric such as precision or recall.
score = accuracy_score(y_test, y_pred)
print(score)

0.7977288857345636


In [8]:
# Another way to get the prediction accuracy: let the model score itself on the test split
model.score(X_test, y_test)

0.7977288857345636

In [9]:
# 6. Using the model (2): predict class probabilities instead of hard labels
y_pred_proba = model.predict_proba(X_test)  # 2-D array: one row per test sample, one column per class
y_pred_proba[0:5]  # first 5 rows

array([[0.82041491, 0.17958509],
       [0.84029613, 0.15970387],
       [0.79819342, 0.20180658],
       [0.62989192, 0.37010808],
       [0.61636611, 0.38363389]])

In [10]:
# Another way to view the probabilities: wrap the array in a DataFrame with named columns.
# This rebinds `a`, replacing the prediction-vs-actual table built earlier.
# NOTE(review): the labels assume column 0 is class 0 (stays) and column 1 is class 1
# (churns); presumably the column order follows model.classes_ - verify.
a = pd.DataFrame(y_pred_proba, columns=['不流失概率', '流失概率'])
a.head()

Unnamed: 0,不流失概率,流失概率
0,0.820415,0.179585
1,0.840296,0.159704
2,0.798193,0.201807
3,0.629892,0.370108
4,0.616366,0.383634


In [11]:
# Churn probability only (the probability of y=1), i.e. the second column of the 2-D array above
y_pred_proba[:,1]

array([0.17958509, 0.15970387, 0.20180658, ..., 0.04220544, 0.09782449,
       0.63586739])

In [12]:
# 7. Inspect the coefficient learned for each feature (extra material, for reference)
model.coef_

array([[ 2.41952469e-05,  8.16881490e-03,  1.04320950e-02,
        -2.54894468e-03, -1.10120609e-04]])

In [13]:
model.intercept_  # intercept of the fitted model; added to the weighted feature sum in the manual formula further down

array([-1.43393291e-06])

In [14]:
# Reproduce the churn probability by hand from the fitted parameters:
# p = 1 / (1 + exp(-(x . w + b)))
import numpy as np
for row_idx in range(5):  # demo on the first 5 test samples only
    features = X_test.iloc[row_idx]
    linear_term = np.dot(features, model.coef_.T) + model.intercept_
    print(1 / (1 + np.exp(-linear_term)))

[0.17958509]
[0.15970387]
[0.20180658]
[0.37010808]
[0.38363389]


In [15]:
# Code summary: the whole pipeline in a single cell
# 1. Load the data
import pandas as pd
df = pd.read_excel('股票客户流失.xlsx')

# 2. Separate the features from the target (the churn flag column)
X = df.drop(columns='是否流失')
y = df['是否流失']

# 3. Split into a training set and a test set (20% held out; fixed seed for a repeatable split)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# 4. Build and train the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

# 5. Using the model (1): predict class labels
y_pred = model.predict(X_test)
print(y_pred[0:100])  # print the first 100 predictions

# Accuracy over the whole test set.
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the ground truth
# first. The value is unchanged (accuracy is symmetric), but the documented order keeps
# this line correct if it is later adapted to an asymmetric metric.
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)
print(score)  # overall prediction accuracy

# 6. Using the model (2): predict class probabilities
y_pred_proba = model.predict_proba(X_test)
print(y_pred_proba[0:5])  # class probabilities of the first 5 customers

[0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1]
0.7977288857345636
[[0.82041491 0.17958509]
 [0.84029613 0.15970387]
 [0.79819342 0.20180658]
 [0.62989192 0.37010808]
 [0.61636611 0.38363389]]
