In [1]:
import pandas as pd
import plotly as py
#? 调用类中的函数
from exploratory_data_analysis import EDAnalysis
#? Alias for plotly's offline plotting function
pyplot = py.offline.plot

#? Load the training and test sets from the local data directory
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
# Optional sanity checks (uncomment as needed):
# train.head(); test.head()
# print(train.info()); print('-' * 50); print(test.info())

#? Transposed describe() summary of the training set, leaving out 'id' and 'Vehicle_Age'
desc_table = train.drop(columns=['id', 'Vehicle_Age']).describe().T
# desc_table

#? Distribution of the target column (displayed as the cell output)
train['Response'].value_counts()

0    334399
1     46710
Name: Response, dtype: int64

In [3]:

# Exploratory look at how individual attributes relate to the target column 'Response'.
# The commented-out snippets are alternative views that can be enabled one at a time;
# only the Annual_Premium chart at the bottom is active.
# NOTE(review): EDAnalysis and its draw_* methods are defined in exploratory_data_analysis,
# which is not shown here; the chart descriptions below are taken from the original comments.

# #* Gender vs. target
# pd.crosstab(train['Gender'], train['Response'])
#? Instantiate the EDA helper on the training frame
eda = EDAnalysis(data=train, id_col='id', target='Response')
# #? Bar chart
# fig = eda.draw_bar_stack_cat(col_name='Gender')
# #* Driving licence (Driving_License) vs. target
# pd.crosstab(train['Driving_License'], train['Response'])
# fig = eda.draw_bar_stack_cat(col_name='Driving_License')

# #* Previously insured (Previously_Insured) vs. target
# pd.crosstab(train['Previously_Insured'], train['Response'])
# fig = eda.draw_bar_stack_cat(col_name='Previously_Insured')

# #* Vehicle age (Vehicle_Age) vs. target
# pd.crosstab(train['Vehicle_Age'], train['Response'])
# fig = eda.draw_bar_stack_cat(col_name='Vehicle_Age')

# #* Vehicle damage (Vehicle_Damage) vs. target
# pd.crosstab(train['Vehicle_Damage'], train['Response'])
# fig = eda.draw_bar_stack_cat(col_name='Vehicle_Damage')

# #* Customer age (Age) vs. target
# pd.crosstab(train['Age'], train['Response'])
# fig = eda.draw_bar_stack_num(col_name='Age')

#* Annual premium (Annual_Premium) vs. target
fig = eda.draw_bar_stack_num(col_name='Annual_Premium')
# Render the returned figure (presumably a plotly figure, given the plotly import — TODO confirm)
fig.show()

In [None]:

#! Drop the Region_Code and Policy_Sales_Channel columns
train = train.drop(columns=['Region_Code', 'Policy_Sales_Channel'])

#! Cap outliers: limit Annual_Premium to mean +/- 3 standard deviations.
# f_min / f_max stay at module level because the test-set cell reuses them.
premium_mean = train['Annual_Premium'].mean()
premium_std = train['Annual_Premium'].std()
f_max = premium_mean + 3 * premium_std
f_min = premium_mean - 3 * premium_std
train['Annual_Premium'] = train['Annual_Premium'].clip(lower=f_min, upper=f_max)

#! Encode the string-valued columns as integers (values missing from a mapping become NaN)
category_codes = {
    'Gender': {'Male': 1, 'Female': 0},
    'Vehicle_Damage': {'Yes': 1, 'No': 0},
    'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
}
for column, codes in category_codes.items():
    train[column] = train[column].map(codes)
train.head()

In [None]:

#! Drop the Region_Code and Policy_Sales_Channel columns
test = test.drop(columns=['Region_Code', 'Policy_Sales_Channel'])

#! Cap outliers in Annual_Premium with the f_min / f_max bounds computed from the training set
test['Annual_Premium'] = test['Annual_Premium'].clip(lower=f_min, upper=f_max)

#! Encode the string-valued columns as integers (values missing from a mapping become NaN)
category_codes = {
    'Gender': {'Male': 1, 'Female': 0},
    'Vehicle_Damage': {'Yes': 1, 'No': 0},
    'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
}
for column, codes in category_codes.items():
    test[column] = test[column].map(codes)
test.head()

In [None]:

#@ Classification modelling (KNN and decision tree)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
#? 预处理函数
from sklearn.preprocessing import MinMaxScaler
#? 模型评估
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
#? 处理样本不平衡，对0类样本进行降采样
from imblearn.under_sampling import RandomUnderSampler

In [None]:

#! Separate the features from the label
X = train.drop(columns=['id', 'Response'])
y = train['Response']

#! Hold out 20% of the rows as a stratified validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)  # type: ignore

#! Under-sample the majority class: keep half of the class-0 rows and every class-1 row.
# The target counts are derived from the actual split rather than hard-coded as
# {0: 133759, 1: 37368}; fixed counts are only valid for one exact split, because
# under-sampling cannot request more rows than a class contains.
class_counts = y_train.value_counts()
sampling_strategy = {0: int(class_counts.loc[0] // 2), 1: int(class_counts.loc[1])}
under_model = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=0)  # type: ignore
X_train, y_train = under_model.fit_resample(X_train, y_train)  # type: ignore

#! Min-max scaling: fit on the resampled training features only, then reuse for validation/test
mms = MinMaxScaler()
X_train_scaled = pd.DataFrame(mms.fit_transform(X_train), columns=X.columns)
X_val_scaled = pd.DataFrame(mms.transform(X_val), columns=X_val.columns)  # type: ignore

#! Test set: select the training feature columns by name.
# This avoids dropping an identifier column whose spelling ('id' vs 'ID') would have to
# match exactly, and guarantees the column set and order the scaler was fitted on.
X_test = test[X.columns]
X_test_scaled = pd.DataFrame(mms.transform(X_test), columns=X_test.columns)

In [None]:

#* Fit a 3-nearest-neighbour classifier on the min-max scaled training data
knn = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
knn.fit(X_train_scaled, y_train)

#* Validation predictions: hard labels for accuracy / F1, class-1 probabilities for ROC AUC.
# ROC AUC is a ranking metric; feeding it thresholded 0/1 labels collapses the curve to a
# single operating point, so it is computed from the predicted probability of class 1.
y_pred = knn.predict(X_val_scaled)
y_score = knn.predict_proba(X_val_scaled)[:, 1]

print('Simple KNeighborsClassifier accuracy: %.3f' % (accuracy_score(y_val, y_pred)))
print('Simple KNeighborsClassifier f1_score: %.3f' % (f1_score(y_val, y_pred)))
print('Simple KNeighborsClassifier roc_auc_score: %.3f' % (roc_auc_score(y_val, y_score)))

#* Predict labels for the test set and show the first five
test_y = knn.predict(X_test_scaled)
test_y[:5]


In [None]:

#* Baseline decision tree (depth capped at 10) on the unscaled, resampled training data
dtc = DecisionTreeClassifier(max_depth=10, random_state=0)
dtc.fit(X_train, y_train)

#* Validation predictions: hard labels for accuracy / F1, class-1 probabilities for ROC AUC.
# ROC AUC is computed from scores rather than thresholded labels so that it reflects
# the ranking quality of the model instead of a single operating point.
y_pred = dtc.predict(X_val)
y_score = dtc.predict_proba(X_val)[:, 1]

print('Simple DecisionTreeClassifier accuracy: %.3f' % (accuracy_score(y_val, y_pred)))
print('Simple DecisionTreeClassifier f1_score: %.3f' % (f1_score(y_val, y_pred)))
print('Simple DecisionTreeClassifier roc_auc_score: %.3f' % (roc_auc_score(y_val, y_score)))


In [None]:

#* Tune the decision tree with an exhaustive grid search, optimising F1
parameters = {
    'splitter': ('best', 'random'),
    'criterion': ('gini', 'entropy'),
    'max_depth': list(range(1, 30, 2)),
}

#* Run a 5-fold cross-validated search over the grid
clf = DecisionTreeClassifier(random_state=0)
GS = GridSearchCV(clf, parameters, cv=5, scoring='f1')
GS.fit(X_train, y_train)

#* Best model: GridSearchCV refits the best parameter combination on the whole training
# set by default (refit=True), so best_estimator_ is already fitted and needs no extra fit.
best_model = GS.best_estimator_

#* Validation predictions: hard labels for accuracy / F1, class-1 probabilities for ROC AUC
y_pred = best_model.predict(X_val)  # type: ignore
y_score = best_model.predict_proba(X_val)[:, 1]  # type: ignore

# NOTE(review): the 'Randomized' label is kept unchanged although the search above is a grid search.
print('Randomized  DecisionTree accuracy: %.3f' % (accuracy_score(y_val, y_pred)))
print('Randomized  DecisionTree f1_score: %.3f' % (f1_score(y_val, y_pred)))
print('Randomized  DecisionTree roc_auc_score: %.3f' % (roc_auc_score(y_val, y_score)))

In [None]:

#* Feature importances of the tuned tree: largest first, with a running total, rounded to 3 decimals
imp = (
    pd.DataFrame({
        'col_name': X_train.columns,
        'importance': best_model.feature_importances_,  # type: ignore
    })
    .sort_values('importance', ascending=False)
    .assign(accumulative_importance=lambda frame: frame['importance'].cumsum())
    .round(3)
)
imp