1.导入需要用到的常用库

In [2]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    learning_curve,
    train_test_split,
)
from sklearn.neural_network import MLPClassifier

%matplotlib inline

2.加载数据集

In [4]:
# Read the raw customer table from the CSV file that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer='organics.csv')


In [5]:
# Preview the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0,CUSTID,GENDER,DOB,EDATE,AGE,AGEGRP1,AGEGRP2,TV_REG,NGROUP,NEIGHBORHOOD,LCDATE,ORGANICS,BILL,REGION,CLASS,AFFL,LTIME
0,140,U,1921-09-16,1998-02-23,76.0,60-80,70-80,Wales & West,C,16.0,1994-11-07,0,16000.0,Midlands,Gold,10.0,4.0
1,620,U,1949-02-12,1998-02-23,49.0,40-60,40-50,Wales & West,D,35.0,1993-06-04,0,6000.0,Midlands,Gold,4.0,5.0
2,868,F,1927-11-27,1998-02-23,70.0,60-80,70-80,Wales & West,D,27.0,1990-08-02,1,0.02,Midlands,Silver,5.0,8.0
3,1120,M,1932-04-10,1998-02-23,65.0,60-80,60-70,Midlands,F,51.0,1991-07-01,1,0.01,Midlands,Tin,10.0,7.0
4,2313,F,1929-05-21,1998-02-23,68.0,60-80,60-70,Midlands,A,4.0,1990-03-01,0,0.01,Midlands,Tin,11.0,8.0


3.数据预处理
    清洗数据的4C准则：
    Correcting: 处理异常值
    Completing: 处理缺失值
    对于定性数据，一般用众数替代缺失值
    对于定量数据，一般用均值、中位数或均值+标准差替代缺失值
    删除不影响分析挖掘的特征
    Creating: 特征工程。创建新的特征，挖掘隐藏的信息。
    Converting: 转换数据格式。例如，将分类型数据进行编码，便于数学计算。

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,CUSTID,AGE,NEIGHBORHOOD,ORGANICS,BILL,AFFL,LTIME
count,22223.0,20715.0,21549.0,22223.0,22223.0,21138.0,21942.0
mean,26055400.0,53.797152,27.193652,0.29474,4420.590041,8.711893,6.56467
std,15074970.0,13.206048,15.751547,0.562831,7559.047522,3.421125,4.657113
min,140.0,18.0,1.0,0.0,0.01,0.0,0.0
25%,11694020.0,44.0,14.0,0.0,0.01,6.0,4.0
50%,28748790.0,54.0,27.0,0.0,2000.0,8.0,5.0
75%,37454020.0,64.0,38.0,0.0,6000.0,11.0,8.0
max,52856470.0,79.0,55.0,3.0,296313.85,34.0,39.0


In [9]:
# Re-fill every missing entry with NaN so that all gaps share a single marker,
# then report how many values are missing in each column.
data = data.fillna(value=np.nan)
data.isna().sum()

CUSTID             0
GENDER          2512
DOB                0
EDATE              0
AGE             1508
AGEGRP1         1508
AGEGRP2         1508
TV_REG           465
NGROUP           674
NEIGHBORHOOD     674
LCDATE           281
ORGANICS           0
BILL               0
REGION           465
CLASS              0
AFFL            1085
LTIME            281
dtype: int64

In [None]:
# Missing-value imputation

# GENDER: fill the gaps with the most frequent category, then encode the
# categories as integers (F -> 0, M -> 1, U -> 2).
# The column is assigned through the frame itself; the chained form
# `data.GENDER[mask] = ...` may write to a temporary copy and leave `data`
# untouched. `.mode().iloc[0]` yields one scalar even when several values tie.
data['GENDER'] = data['GENDER'].fillna(data['GENDER'].mode().iloc[0])
for gender_label, gender_code in (('F', 0), ('M', 1), ('U', 2)):
    data.loc[data['GENDER'] == gender_label, 'GENDER'] = gender_code

# AGE: recompute for every row as the number of completed years between DOB
# and EDATE. Comparing calendar fields is exact, whereas floor(days / 365)
# ignores leap days and over-counts by one shortly before a birthday.
dob = pd.to_datetime(data['DOB'])
edate = pd.to_datetime(data['EDATE'])
birthday_pending = (edate.dt.month < dob.dt.month) | (
    (edate.dt.month == dob.dt.month) & (edate.dt.day < dob.dt.day)
)
data['AGE'] = edate.dt.year - dob.dt.year - birthday_pending.astype(int)
def split_Age1(x):
    """Map an age in years to the index of its 20-year band.

    Bands are left-closed, matching the AGEGRP1 labels already present in the
    data set (an age of 70 is labelled '60-80', an age of 60 would start that
    band): 0 -> [0, 20), 1 -> [20, 40), 2 -> [40, 60), 3 -> [60, 80),
    4 -> [80, 100]. The upper limit 100 stays inside the last band.

    Parameters
    ----------
    x : int or float
        Age in years.

    Returns
    -------
    int or None
        Band index 0-4, or None when the age is NaN, negative or above 100.
    """
    # NaN fails every comparison, so this guard rejects it together with
    # negative and too-large ages (negatives previously landed in band 1).
    if not (0 <= x <= 100):
        return None
    return min(int(x // 20), 4)
    
def split_Age2(x):
    """Map an age in years to the index of its 10-year band.

    Bands are left-closed, matching the AGEGRP2 labels already present in the
    data set (an age of 70 is labelled '70-80'): 0 -> [0, 10), 1 -> [10, 20),
    ..., 8 -> [80, 90), 9 -> [90, 100]. The upper limit 100 stays inside the
    last band.

    Parameters
    ----------
    x : int or float
        Age in years.

    Returns
    -------
    int or None
        Band index 0-9, or None when the age is NaN, negative or above 100.
    """
    # NaN fails every comparison, so this guard rejects it together with
    # negative and too-large ages (negatives previously landed in band 1).
    if not (0 <= x <= 100):
        return None
    return min(int(x // 10), 9)
# Rebuild both age-band columns from AGE (the frame is `data`; no `df` exists).
data['AGEGRP1'] = data['AGE'].apply(split_Age1)
data['AGEGRP2'] = data['AGE'].apply(split_Age2)

# Categorical / discrete columns: replace gaps with the most frequent value.
# LCDATE is a date stored as text; it is mode-filled like the others.
# Columns are assigned through the frame instead of chained indexing so the
# write always reaches `data`; `.mode().iloc[0]` picks one scalar on ties.
for mode_col in ['TV_REG', 'NGROUP', 'NEIGHBORHOOD', 'LCDATE', 'REGION']:
    data[mode_col] = data[mode_col].fillna(data[mode_col].mode().iloc[0])

# Continuous columns: replace gaps with the column mean. `Series.mean()`
# already skips NaN and returns a plain scalar (it has no `.values`).
for mean_col in ['AFFL', 'LTIME']:
    data[mean_col] = data[mean_col].fillna(data[mean_col].mean())

In [None]:
# Outlier handling (TODO: not implemented yet)


In [None]:
# Re-count the missing values per column to confirm the imputation above.
data.isna().sum()

4.模型训练
    

In [None]:
# Separate the target from the predictors.
# The table loaded above exposes 'ORGANICS' (values 0-3 in describe()) but no
# 'ORGYN' column, so looking up 'ORGYN' directly raises KeyError. Use 'ORGYN'
# when it exists; otherwise derive the yes/no flag from the count.
# NOTE(review): "ORGANICS > 0 means ORGYN = 1" is an assumption - confirm
# against the data dictionary.
if 'ORGYN' in data.columns:
    labels = data['ORGYN']
else:
    labels = (data['ORGANICS'] > 0).astype(int)

# Both target columns must leave the feature matrix, otherwise the models can
# read the answer straight from the inputs.
features = data.drop(columns=['ORGYN', 'ORGANICS'], errors='ignore')

# Drop the customer identifier and the raw date strings: the estimators
# cannot consume date text, and AGE already carries the DOB/EDATE information.
# NOTE(review): LTIME presumably covers LCDATE - confirm before relying on it.
features = features.drop(columns=['CUSTID', 'DOB', 'EDATE', 'LCDATE'], errors='ignore')

# One-hot encode the remaining text columns so every feature is numeric.
features = pd.get_dummies(features)

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)



In [None]:
# Logistic regression: grid search over the inverse regularisation strength C.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
# cv is pinned to 3 folds, the value this cell was written for; recent
# scikit-learn releases default to 5 when cv is omitted.
clf = GridSearchCV(LogisticRegression(penalty='l2'), param_grid, cv=3)
clf.fit(X_train, y_train)  # default (X_train, y_train) or optimal (X_train['agegroup'])
# Report the best hyper-parameters found.
print(clf.best_params_)
# TODO: pick out the features with the largest coefficients.

# Recursive feature elimination: keep the 5 strongest features, removing one
# per iteration. `n_features_to_select` must be passed by keyword; current
# scikit-learn rejects it as a positional argument.
estimator = LogisticRegression()
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X_train, y_train)
# Model comparison (all inputs vs. optimal inputs; accuracy can decide which
# works better). The optimal inputs are expected to win because the full
# input set carries more noise.

In [None]:
# Random forest: grid search over forest size and features tried per split.
# random_state makes the bootstrap samples and feature draws repeatable.
rfc = RandomForestClassifier(n_jobs=-1, max_features='sqrt', n_estimators=50,
                             oob_score=True, random_state=42)
# 'auto' was dropped from the grid: for classifiers it was an alias of 'sqrt'
# (so it only duplicated fits) and newer scikit-learn releases reject it.
param_grid = {
    'n_estimators': [200, 700],
    'max_features': ['sqrt', 'log2']
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_)

In [None]:
# Multi-layer perceptron: grid search over the hidden-layer layout, the
# optimiser tolerance and the numerical-stability term epsilon.
param_grid = {
    'hidden_layer_sizes': [(7, 7), (128,), (128, 7)],
    'tol': [1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
    # 1e-8 used to appear twice, which only repeated identical fits.
    'epsilon': [1e-3, 1e-7, 1e-8, 1e-9]
}
# NOTE(review): learning_rate='adaptive' only takes effect with solver='sgd',
# while epsilon only applies to the default 'adam' solver - confirm which
# solver is intended. learning_rate_init=1.0 is also unusually large.
# random_state fixes weight initialisation, shuffling and the early-stopping
# validation split so repeated runs give the same result.
estimator = GridSearchCV(
    MLPClassifier(learning_rate='adaptive', learning_rate_init=1., early_stopping=True,
                  shuffle=True, random_state=42),
    param_grid=param_grid, n_jobs=-1)
estimator.fit(X_train, y_train)
print(estimator.best_estimator_)
