## 数据预处理

### 导入数据

In [10]:
import pandas as pd
from collections import Counter

# Load the training set from the CSV file and print its column summary
# (non-null counts and dtypes).
df_train = pd.read_csv('data/train.csv') # training set
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304888 entries, 0 to 304887
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    304888 non-null  int64  
 1   Gender                304888 non-null  object 
 2   Age                   304888 non-null  int64  
 3   Driving_License       304010 non-null  float64
 4   Region_Code           304888 non-null  float64
 5   Previously_Insured    304888 non-null  int64  
 6   Vehicle_Age           304888 non-null  object 
 7   Vehicle_Damage        304888 non-null  object 
 8   Annual_Premium        304888 non-null  float64
 9   Policy_Sales_Channel  304007 non-null  float64
 10  Vintage               304888 non-null  int64  
 11  Response              304888 non-null  int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 27.9+ MB


### 数据格式处理（未升维）

In [11]:
# Data format processing: turn the text-valued columns into integer codes.
num_feat = ['Age', 'Vintage', 'Previously_Insured', 'Annual_Premium']  # numeric attributes
nor_feat = ['Gender', 'Driving_License', 'Region_Code', 'Vehicle_Damage',  'Vehicle_Age', 'Policy_Sales_Channel']  # nominal attributes

# Label encoding used for each text column.
category_codes = {
    'Gender': {'Female': 0, 'Male': 1},
    'Vehicle_Damage': {'Yes': 1, 'No': 0},
    'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
}
for column, codes in category_codes.items():
    df_train[column] = df_train[column].map(codes).astype(int)

# Region_Code is loaded as float64 and has no missing values, so it can be
# cast to int right away.
df_train['Region_Code'] = df_train['Region_Code'].astype(int)

# Policy_Sales_Channel and Driving_License are NOT cast here: they still
# contain NaN at this point, and NaN cannot be converted to int.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304888 entries, 0 to 304887
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    304888 non-null  int64  
 1   Gender                304888 non-null  int32  
 2   Age                   304888 non-null  int64  
 3   Driving_License       304010 non-null  float64
 4   Region_Code           304888 non-null  int32  
 5   Previously_Insured    304888 non-null  int64  
 6   Vehicle_Age           304888 non-null  int32  
 7   Vehicle_Damage        304888 non-null  int32  
 8   Annual_Premium        304888 non-null  float64
 9   Policy_Sales_Channel  304007 non-null  float64
 10  Vintage               304888 non-null  int64  
 11  Response              304888 non-null  int64  
dtypes: float64(3), int32(4), int64(5)
memory usage: 23.3 MB


In [None]:
# Missing-value handling, option 1: drop every row that contains a null
# (nominal attributes).
# The index is rebuilt as a contiguous 0..n-1 range afterwards so that
# positional and label-based row access keep agreeing once rows are removed.
df_train = df_train.dropna(axis=0, how='any').reset_index(drop=True)


In [None]:
# Missing-value handling, option 2: logistic regression (nominal attributes).

# Split the rows into those that are missing the target column(s) and those
# that have them.
null_pro = ['Driving_License']

# One vectorised mask replaces the per-row slice-and-concat loop: it is far
# cheaper on a frame of this size, never duplicates a row when null_pro holds
# several columns, does not depend on the index being 0..n-1, and yields an
# empty frame (instead of raising) when one side has no rows.
null_mask = df_train[null_pro].isnull().any(axis=1)

# .copy() gives independent frames, as pd.concat did, so later edits to them
# never touch df_train.
df_null = df_train[null_mask].copy()
df_notnull = df_train[~null_mask].copy()



In [None]:
# Print the column summary of the rows that were split off as having a
# missing value.
df_null.info()

In [None]:
# Separate the label (Driving_License) from the features used by the
# imputation model.
# Columns kept out of the feature matrix: the final target, the row id, the
# label itself, and Policy_Sales_Channel.
dropped_cols = ['Response', 'id', 'Driving_License', 'Policy_Sales_Channel']

# Rows with a known Driving_License supply the training label and features.
train_dl = df_notnull["Driving_License"]
train_data = df_notnull.drop(columns=dropped_cols)

# Rows with a missing Driving_License are the ones to predict.
test_data = df_null.drop(columns=dropped_cols)

In [None]:
# Fit a logistic regression on the rows with a known Driving_License and
# predict the value for the rows where it is missing.
from sklearn import linear_model
lr = linear_model.LogisticRegression()
lr.fit(train_data, train_dl)
predict_dl = lr.predict(test_data)

# Write the predictions back with a single .loc assignment.
# Chained indexing (df_train[col][i] = ...) may assign into a temporary copy
# and is a no-op under pandas Copy-on-Write; .loc always writes to df_train.
# predict_dl follows the row order of test_data, which is the order of the
# missing rows in df_train, so the boolean mask lines the values up.
missing_dl = df_train['Driving_License'].isnull()
if missing_dl.any():  # nothing to do when the cell is re-run after filling
    df_train.loc[missing_dl, 'Driving_License'] = predict_dl

df_train.info()

In [12]:
# Missing-value handling, option 3: roulette-wheel fill (nominal attributes).
# Every missing entry is replaced by one of the column's observed values,
# drawn with probability proportional to how often that value occurs.
import random

# Dedicated seeded generator: the fill is reproducible on every run and the
# global `random` state is left untouched.
rng = random.Random(0)

null_pro = ['Driving_License', 'Policy_Sales_Channel']
for pro in null_pro:
    missing = df_train[pro].isnull()
    value_counts = df_train[pro].value_counts()  # NaN is excluded by default
    if not missing.any() or value_counts.empty:
        # Nothing to fill, or no observed value to sample from.
        continue
    # random.choices performs the frequency-weighted draw for all missing
    # rows at once, replacing the hand-rolled per-row wheel.
    drawn = rng.choices(
        value_counts.index.tolist(),
        weights=value_counts.tolist(),
        k=int(missing.sum()),
    )
    # .loc writes into df_train itself; chained indexing
    # (df_train[pro][i] = ...) can write to a temporary copy instead.
    df_train.loc[missing, pro] = drawn
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304888 entries, 0 to 304887
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    304888 non-null  int64  
 1   Gender                304888 non-null  int32  
 2   Age                   304888 non-null  int64  
 3   Driving_License       304888 non-null  float64
 4   Region_Code           304888 non-null  int32  
 5   Previously_Insured    304888 non-null  int64  
 6   Vehicle_Age           304888 non-null  int32  
 7   Vehicle_Damage        304888 non-null  int32  
 8   Annual_Premium        304888 non-null  float64
 9   Policy_Sales_Channel  304888 non-null  float64
 10  Vintage               304888 non-null  int64  
 11  Response              304888 non-null  int64  
dtypes: float64(3), int32(4), int64(5)
memory usage: 23.3 MB


In [13]:
# Data format processing: cast the two columns to int.
for filled_col in ('Policy_Sales_Channel', 'Driving_License'):
    df_train[filled_col] = df_train[filled_col].astype(int)

In [14]:
# Split the data: separate id / target from the features, then hold out 30%
# of the rows as a test set (fixed random_state for a repeatable split).
from sklearn.model_selection import train_test_split

train_id = df_train['id']
train_target = df_train['Response']
train = df_train.drop(columns=['Response', 'id'])

x_train, x_test, y_train, y_test = train_test_split(
    train,
    train_target,
    test_size=0.3,
    random_state=0,
)



In [15]:
# Print the column summary of the training feature matrix.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213421 entries, 82054 to 117952
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Gender                213421 non-null  int32  
 1   Age                   213421 non-null  int64  
 2   Driving_License       213421 non-null  int32  
 3   Region_Code           213421 non-null  int32  
 4   Previously_Insured    213421 non-null  int64  
 5   Vehicle_Age           213421 non-null  int32  
 6   Vehicle_Damage        213421 non-null  int32  
 7   Annual_Premium        213421 non-null  float64
 8   Policy_Sales_Channel  213421 non-null  int32  
 9   Vintage               213421 non-null  int64  
dtypes: float64(1), int32(6), int64(3)
memory usage: 13.0 MB


In [36]:
# Quick baseline with a decision tree.
# Everything this cell uses is imported here so that it runs on a fresh
# kernel; classification_report was previously only imported by another cell.
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# `tree` still names the fitted classifier afterwards, but the class is
# imported directly, so the sklearn.tree module is no longer shadowed by the
# estimator. random_state pins the tie-breaking between equally good splits,
# making the reported scores repeatable.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(x_train, y_train)

# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_test, tree.predict(x_test)))

# Accuracy on the training set and on the test set.
print("模型训练集的准确率：%.3f" %tree.score(x_train, y_train))
print("模型测试集的准确率：%.3f" %tree.score(x_test, y_test))

Counter({0: 187239, 1: 26182})
Counter({0: 79592, 1: 11875})


In [29]:
# Logistic regression baseline.
from sklearn import linear_model
from sklearn.metrics  import classification_report

# fit() returns the estimator itself, so construction and training chain.
lr = linear_model.LogisticRegression().fit(x_train, y_train)

# Tally how many training rows are predicted as each class; this is the
# cell's displayed value.
# (Accuracy on either split is available through lr.score(x, y) if needed.)
train_predictions = lr.predict(x_train)
Counter(train_predictions)

Counter({0: 213421})