In [1]:
import pandas as pd
import numpy as np
from sklearn import cross_validation, linear_model, metrics

Загрузим данные без отложенной выборки

In [2]:
data = pd.read_csv("orange_churn_data.train")
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 0 to 29999
Columns: 231 entries, Var1 to labels
dtypes: float64(191), int64(2), object(38)
memory usage: 53.1+ MB


Подготовим данные к построению модели

In [3]:
features = data.drop('labels', axis=1)

In [4]:
names_to_drop = []
for name in features.columns:
    col = features[name].dropna()
    if len(col.unique()) < 2:
        names_to_drop.append(name)

names_to_drop

['Var8',
 'Var15',
 'Var20',
 'Var31',
 'Var32',
 'Var39',
 'Var42',
 'Var48',
 'Var52',
 'Var55',
 'Var79',
 'Var118',
 'Var141',
 'Var167',
 'Var169',
 'Var175',
 'Var185',
 'Var191',
 'Var209',
 'Var213',
 'Var215',
 'Var224',
 'Var230']

удалим малозначащие колонки и в оставшихся заполним пропуски нулями (известно, что среди категорий нигде нет значения 0)

In [5]:
features.drop(names_to_drop, axis=1, inplace=True)
features.fillna(0, inplace=True)
features.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var219,Var220,Var221,Var222,Var223,Var225,Var226,Var227,Var228,Var229
0,0,0,0,0,0,385,0,0,0,0,...,0,eNCmIK9,oslk,xhoMnus,0,0,szEZ,RAYp,F2FyR07IdsN7I,0
1,0,0,0,0,0,784,7,0,0,0,...,qxDb,Z6VfSEp,Al6ZaUT,503RDbC,LM8l689qOp,kG3k,Qcbd,RAYp,iyHGyLCEkQ,0
2,0,0,0,0,0,847,7,0,0,0,...,FzaX,UjGuLpj,oslk,3QPw991,LM8l689qOp,0,Qu4f,RAYp,F2FyR07IdsN7I,0
3,0,0,0,0,0,1078,21,0,0,0,...,FzaX,XbZitea,zCkv,DHPNgqU,jySVZNlOJy,kG3k,7aLG,6fzt,SbOd7O8ky1wGNxp0Arj0Xs,mj86
4,0,0,0,0,0,686,7,0,0,0,...,FzaX,l3KfrxM,oslk,0Ya7A2G,LM8l689qOp,0,Aoh3,RAYp,F2FyR07IdsN7I,0


Поработаем с категориальными данными

In [6]:
cat_start = np.where(features.columns.values == 'Var190')[0][0]
cat_end = len(features.columns)
(cat_start, cat_end)

(172, 207)

In [7]:
categors = features.iloc[:, cat_start:cat_end]
sum([len(categors[name].unique()) for name in categors.columns])

54335

In [8]:
big_cats = [(name,len(categors[name].unique())) for name in categors.columns if len(categors[name].unique()) > 500]
big_cats

[('Var198', 3413),
 ('Var199', 3663),
 ('Var200', 10884),
 ('Var202', 5281),
 ('Var214', 10884),
 ('Var216', 1583),
 ('Var217', 10621),
 ('Var220', 3413),
 ('Var222', 3413)]

Видим, что есть признаки с очень большим количеством категорий. Причем признаки с наибольшим количеством категорий показали наибольшую корреляцию ранее. Но для baseline решения пока что просто откажется от признаков с большим количеством категорий

In [9]:
dummies = pd.get_dummies(categors.drop([x[0] for x in big_cats], axis=1))
dummies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 0 to 29999
Columns: 974 entries, Var190 to Var229_sk2h
dtypes: float64(974)
memory usage: 223.2 MB


In [10]:
dummies.head()

Unnamed: 0,Var190,Var192_0,Var192_0G9vyxdMWg,Var192_0kQTmBU3gb,Var192_0kQqrQsiZt,Var192_0vimfo8zhV,Var192_1GdOj17ejg,Var192_1GdOj1KXzC,Var192_1JGTmBQZiT,Var192_1JGqrQKzJV,...,Var228_d0LtHjWeaXyArdN4sxU_saXqH,Var228_ib5G6X1eUxUn6,Var228_iyHGyLCEkQ,Var228_r_7E,Var228_xwM2aC7IdeMC0,Var229_0,Var229_am7c,Var229_mj86,Var229_oJmt,Var229_sk2h
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [11]:
objects = features.iloc[:, 0:cat_start]
objects = pd.concat([objects, dummies], axis=1)
objects.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var228_d0LtHjWeaXyArdN4sxU_saXqH,Var228_ib5G6X1eUxUn6,Var228_iyHGyLCEkQ,Var228_r_7E,Var228_xwM2aC7IdeMC0,Var229_0,Var229_am7c,Var229_mj86,Var229_oJmt,Var229_sk2h
0,0,0,0,0,0,385,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,784,7,0,0,0,...,0,0,1,0,0,1,0,0,0,0
2,0,0,0,0,0,847,7,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1078,21,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,686,7,0,0,0,...,0,0,0,0,0,1,0,0,0,0


Теперь перейдем к непосредственному построению модели. Основную метрику возьмем AUC-PRC, оценку качества сделаем с помощью кросс-валидации

In [12]:
y = data['labels']
cv_strategy = cross_validation.StratifiedKFold(y, random_state = 2)
ridge_scoring = cross_validation.cross_val_score(linear_model.RidgeClassifier(), objects, y, scoring = 'average_precision', cv = cv_strategy)
ridge_scoring

array([ 0.14209798,  0.12588887,  0.13919403])

In [13]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [14]:
forest_scoring = cross_validation.cross_val_score(RandomForestClassifier(), objects, y, scoring = 'average_precision', cv = cv_strategy)
crad_scoring = cross_validation.cross_val_score(GradientBoostingClassifier(), objects, y, scoring = 'average_precision', cv = cv_strategy)
forest_scoring

array([ 0.12092705,  0.11040664,  0.1029059 ])

In [15]:
crad_scoring

array([ 0.21298658,  0.22283504,  0.2071597 ])

In [16]:
print "Ridge classifier base score: ", ridge_scoring.mean()
print "Random forest classifier base score: ", forest_scoring.mean()
print "Gradient boosting classifier base score: ", crad_scoring.mean()

Ridge classifier base score:  0.135726959485
Random forest classifier base score:  0.111413196899
Gradient boosting classifier base score:  0.21432710668


Видим, что качество получилось ужасно слабое на наших моделях.
Составим обзорную табличку для разных стратегий и различных параметров, чтобы посмотреть, как оно меняется.

In [17]:
scorers = ['average_precision', metrics.scorer.f1_scorer, metrics.scorer.accuracy_scorer, metrics.scorer.precision_scorer, metrics.scorer.recall_scorer, \
           metrics.scorer.roc_auc_scorer]
folds = [3, 4, 5, 7, 10]

In [18]:
score_names = ['AUC-PRC', 'F1', 'Acc', 'Precis', 'Recall', 'ROC']
table = pd.DataFrame(columns=score_names)
table

Unnamed: 0,AUC-PRC,F1,Acc,Precis,Recall,ROC


In [19]:
rows = []
print "Ridge classifier:"
for fold_num in folds:
    cv_str = cross_validation.StratifiedKFold(y, random_state = 2, n_folds=fold_num)
    row = []
    for scor in scorers:
        result = cross_validation.cross_val_score(linear_model.RidgeClassifier(), objects, y, scoring = scor, cv = cv_str)
        row.append(result.mean())
    rows.append(row)
table = pd.DataFrame(np.array(rows), columns=score_names)
table.index = folds
table

Ridge classifier:


Unnamed: 0,AUC-PRC,F1,Acc,Precis,Recall,ROC
3,0.135727,0.015821,0.925367,0.446759,0.008065,0.633292
4,0.138183,0.015766,0.925233,0.440812,0.008065,0.635717
5,0.13734,0.020083,0.925467,0.461859,0.010305,0.637966
7,0.141931,0.020164,0.925633,0.529252,0.010303,0.643937
10,0.14204,0.02099,0.925733,0.539722,0.010758,0.642461


In [20]:
rows = []
print "Logistic regression classifier:"
for fold_num in folds:
    cv_str = cross_validation.StratifiedKFold(y, random_state = 2, n_folds=fold_num)
    row = []
    for scor in scorers:
        result = cross_validation.cross_val_score(linear_model.LogisticRegression(), objects, y, scoring = scor, cv = cv_str)
        row.append(result.mean())
    rows.append(row)
table = pd.DataFrame(np.array(rows), columns=score_names)
table.index = folds
table

Logistic regression classifier:


Unnamed: 0,AUC-PRC,F1,Acc,Precis,Recall,ROC
3,0.086963,0.033336,0.916967,0.124202,0.019265,0.540772
4,0.087676,0.033382,0.917,0.124941,0.019265,0.543011
5,0.087403,0.034789,0.9168,0.127173,0.020159,0.542892
7,0.088865,0.034086,0.9169,0.127013,0.019713,0.544032
10,0.088174,0.036414,0.916933,0.135707,0.021058,0.538644


In [21]:
print "Random forest classifier:"
rows = []
for fold_num in folds:
    cv_str = cross_validation.StratifiedKFold(y, random_state = 2, n_folds=fold_num)
    row = []
    for scor in scorers:
        result = cross_validation.cross_val_score(RandomForestClassifier(), objects, y, scoring = scor, cv = cv_str)
        row.append(result.mean())
    rows.append(row)
table = pd.DataFrame(np.array(rows), columns=score_names)
table.index = folds
table

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Random forest classifier:


Unnamed: 0,AUC-PRC,F1,Acc,Precis,Recall,ROC
3,0.121775,0.005322,0.925633,0.224242,0.001792,0.585742
4,0.118918,0.007083,0.925533,0.229167,0.00448,0.572952
5,0.108963,0.004429,0.925433,0.183333,0.002241,0.584131
7,0.117147,0.008868,0.925267,0.232143,0.002242,0.589793
10,0.116088,0.008838,0.925667,0.143333,0.00627,0.597411


In [22]:
rows = []
folds_min = [3, 5, 8]
print "Gradient boosting classifier:"
for fold_num in folds_min:
    cv_str = cross_validation.StratifiedKFold(y, random_state = 2, n_folds=fold_num)
    row = []
    for scor in scorers:
        result = cross_validation.cross_val_score(GradientBoostingClassifier(), objects, y, scoring = scor, cv = cv_str)
        row.append(result.mean())
    rows.append(row)
table = pd.DataFrame(np.array(rows), columns=score_names)
table.index = folds_min
table

Gradient boosting classifier:


Unnamed: 0,AUC-PRC,F1,Acc,Precis,Recall,ROC
3,0.211896,0.033925,0.9261,0.559147,0.017473,0.734462
5,0.21236,0.034781,0.926167,0.589283,0.018368,0.738851
8,0.215422,0.038897,0.926167,0.600339,0.020161,0.737432


Из данных табличек видим, что основное ухудшение качества дает именно Recall. Нужно будет обратить на него внимание в дальнейшем.