In [1]:
import pandas as pd
import numpy as np
from sklearn import cross_validation, linear_model, metrics, preprocessing
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import feature_selection as fs
from sklearn.svm import LinearSVC



In [2]:
train = pd.read_csv("orange_small_churn_train_data.csv", index_col='ID')
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40000 entries, 0 to 39999
Columns: 231 entries, Var1 to labels
dtypes: float64(191), int64(2), object(38)
memory usage: 70.8+ MB


In [3]:
y = train['labels']

везде далее буду проверять качество на двух метриках: roc_auc и average_precision

In [4]:
# заполним пропуски средним по столбцу
numbers = train.ix[:, 0:190]
names_to_drop = []
for name in numbers.columns:
    col = numbers[name].dropna()
    if len(col.unique()) < 2:
        names_to_drop.append(name)

numbers.drop(names_to_drop, axis=1, inplace=True)

numbers2 = numbers.copy()

In [5]:
for i in range(numbers.shape[1]):
    col = numbers.ix[:, i]
    mean = col.mean()
    col.fillna(mean, inplace=True)

clf = RandomForestClassifier(n_estimators=200, random_state=2)
forest_scoring = cross_validation.cross_val_score(clf, numbers, y, scoring = metrics.scorer.roc_auc_scorer)
forest_scoring.mean()

0.66352751594313852

In [6]:
clf = RandomForestClassifier(n_estimators=200, random_state=2)
forest_scoring = cross_validation.cross_val_score(clf, numbers, y, scoring = 'average_precision')
forest_scoring.mean()

0.14020025780786105

In [7]:
# заполним пропуски нулями
numbers2.fillna(0, inplace=True)

clf = RandomForestClassifier(n_estimators=200, random_state=2)
forest_scoring = cross_validation.cross_val_score(clf, numbers2, y, scoring = metrics.scorer.roc_auc_scorer)
forest_scoring.mean()

0.66047584032504203

In [8]:
clf = RandomForestClassifier(n_estimators=200, random_state=2)
forest_scoring = cross_validation.cross_val_score(clf, numbers2, y, scoring = 'average_precision')
forest_scoring.mean()

0.13561701242809795

In [9]:
clf = LinearSVC(random_state=2)
forest_scoring = cross_validation.cross_val_score(clf, numbers, y, scoring = metrics.scorer.roc_auc_scorer)
forest_scoring.mean()

0.52564438743043163

In [10]:
clf = LinearSVC(random_state=2)
forest_scoring = cross_validation.cross_val_score(clf, numbers2, y, scoring = metrics.scorer.roc_auc_scorer)
forest_scoring.mean()

0.50829508417560632

In [46]:
# проверил раньше, что запись в числах дает лучше результат, чем бинаризация на имеющихся категориях
cat_as_nums = pd.read_csv("categories_only_as_numbers.csv", index_col="ID")

nums = pd.concat([numbers, cat_as_nums], axis=1)
nums.head()

Unnamed: 0_level_0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var219,Var220,Var221,Var222,Var223,Var225,Var226,Var227,Var228,Var229
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,11.003509,0.00497,429.328358,0.148235,249055.515021,3052,6.792126,47.487719,381346.56309,8.549254,...,2742,0,101,0,2450,1333,154,117,88,1355
1,11.003509,0.00497,429.328358,0.148235,249055.515021,1813,7.0,47.487719,381346.56309,8.549254,...,2742,0,2407,0,2450,1333,132,2337,171,1355
2,11.003509,0.00497,429.328358,0.148235,249055.515021,1953,7.0,47.487719,381346.56309,8.549254,...,2742,138,277,138,2450,1333,607,211,85,1355
3,11.003509,0.00497,429.328358,0.148235,249055.515021,1533,7.0,47.487719,381346.56309,8.549254,...,2742,4,2407,4,2450,1333,132,2337,2270,1614
4,11.003509,0.00497,429.328358,0.148235,249055.515021,686,7.0,47.487719,381346.56309,8.549254,...,2742,0,2407,0,2450,1333,248,2337,2270,1614


In [47]:
sfm = fs.SelectFromModel(LinearSVC(penalty="l1", dual=False))
X = sfm.fit_transform(nums, y)
X.shape

(40000, 139)

In [48]:
clf = RandomForestClassifier(n_estimators=200, random_state=2)
forest_scoring = cross_validation.cross_val_score(clf, nums, y, scoring = metrics.scorer.roc_auc_scorer)
forest_scoring.mean()

0.98129057849388923

In [49]:
clf = RandomForestClassifier(n_estimators=200, random_state=2)
forest_scoring = cross_validation.cross_val_score(clf, nums, y, scoring = 'average_precision')
forest_scoring.mean()

0.75095882886968113

In [50]:
clf = RandomForestClassifier(n_estimators=200, random_state=2)
forest_scoring = cross_validation.cross_val_score(clf, X, y, scoring = metrics.scorer.roc_auc_scorer)
forest_scoring.mean()

0.98126321490330959

In [51]:
clf = RandomForestClassifier(n_estimators=200, random_state=2)
forest_scoring = cross_validation.cross_val_score(clf, X, y, scoring = 'average_precision')
forest_scoring.mean()

0.74515586029220671

In [18]:
from sklearn import grid_search



In [23]:
clf = RandomForestClassifier(random_state=2)
params = {
    "class_weight": [{1: 10}, {1: 35}, {1: 40}, {1: 45}, {1: 60}],
    'max_depth' : [10, 20],
    'min_samples_leaf' : [1, 3, 5],
    'n_estimators' : [200],
}
grid_cv = grid_search.GridSearchCV(clf, params, scoring = metrics.scorer.roc_auc_scorer, n_jobs=4)

In [24]:
%%time
grid_cv.fit(X, y)

CPU times: user 17.2 s, sys: 332 ms, total: 17.6 s
Wall time: 5min 46s


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=2,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'n_estimators': [200], 'min_samples_leaf': [1, 3, 5], 'max_depth': [10, 20], 'class_weight': [{1: 10}, {1: 35}, {1: 40}, {1: 45}, {1: 60}]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(roc_auc_score, needs_threshold=True), verbose=0)

In [25]:
print grid_cv.best_score_
print grid_cv.best_params_

0.788524223264
{'n_estimators': 200, 'min_samples_leaf': 5, 'max_depth': 20, 'class_weight': {1: 10}}


In [26]:
clf = RandomForestClassifier(random_state=2)
params = {
    "class_weight": [{1: 5}, {1: 10}, {1: 15}, 'balanced'],
    'min_samples_leaf' : [4, 5, 7, 10],
    'n_estimators' : [200]
}
grid_cv = grid_search.GridSearchCV(clf, params, scoring = metrics.scorer.roc_auc_scorer, n_jobs=4)

In [27]:
%%time
grid_cv.fit(X, y)

CPU times: user 14 s, sys: 300 ms, total: 14.3 s
Wall time: 3min


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=2,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'n_estimators': [200], 'min_samples_leaf': [4, 5, 7, 10], 'class_weight': [{1: 5}, {1: 10}, {1: 15}, 'balanced']},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(roc_auc_score, needs_threshold=True), verbose=0)

In [28]:
print grid_cv.best_score_
print grid_cv.best_params_

0.792869352175
{'n_estimators': 200, 'min_samples_leaf': 10, 'class_weight': {1: 5}}


In [29]:
clf = RandomForestClassifier(random_state=2)
params = {
    "class_weight": [{1: 1}, {1: 3}, {1: 5}, {1: 7}],
    'min_samples_leaf' : [9, 10, 15, 25],
    'n_estimators' : [200]
}
grid_cv = grid_search.GridSearchCV(clf, params, scoring = metrics.scorer.roc_auc_scorer, n_jobs=4)

In [30]:
%%time
grid_cv.fit(X, y)

CPU times: user 14.8 s, sys: 280 ms, total: 15.1 s
Wall time: 2min 55s


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=2,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'n_estimators': [200], 'min_samples_leaf': [9, 10, 15, 25], 'class_weight': [{1: 1}, {1: 3}, {1: 5}, {1: 7}]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(roc_auc_score, needs_threshold=True), verbose=0)

In [31]:
print grid_cv.best_score_
print grid_cv.best_params_

0.793297690246
{'n_estimators': 200, 'min_samples_leaf': 15, 'class_weight': {1: 5}}


In [32]:
clf = RandomForestClassifier(random_state=2)
params = {
    "class_weight": [{1: 5}],
    'min_samples_leaf' : [15, 20],
    'n_estimators' : [200],
    'criterion' : ['gini', 'entropy'],
    'max_features' : [0.3, 0.6, 0.7, 0.8]
}
grid_cv = grid_search.GridSearchCV(clf, params, scoring = metrics.scorer.roc_auc_scorer, n_jobs=4)

In [33]:
%%time
grid_cv.fit(X, y)

CPU times: user 32.1 s, sys: 512 ms, total: 32.6 s
Wall time: 12min 26s


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=2,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'n_estimators': [200], 'max_features': [0.3, 0.6, 0.7, 0.8], 'min_samples_leaf': [15, 20], 'criterion': ['gini', 'entropy'], 'class_weight': [{1: 5}]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(roc_auc_score, needs_threshold=True), verbose=0)

In [34]:
print grid_cv.best_score_
print grid_cv.best_params_

0.800179854004
{'max_features': 0.3, 'n_estimators': 200, 'min_samples_leaf': 20, 'criterion': 'entropy', 'class_weight': {1: 5}}


In [57]:
test = pd.read_csv("orange_small_churn_test_data.csv", index_col='ID')

test_nums = test.ix[:, 0:190]
names_to_drop = []
for name in test_nums.columns:
    col = test_nums[name].dropna()
    if len(col.unique()) < 2:
        names_to_drop.append(name)

#test_nums.drop(names_to_drop, axis=1, inplace=True)

for i in range(test_nums.shape[1]):
    col = test_nums.ix[:, i]
    mean = col.mean()
    col.fillna(mean, inplace=True)
    
test_nums.head()

Unnamed: 0_level_0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var181,Var182,Var183,Var184,Var185,Var186,Var187,Var188,Var189,Var190
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,13.575758,0,408.06383,0.029605,201664.611801,1225.0,7.0,,50.984848,433341.195652,...,0.0,1480428.730263,65347.429787,12.919149,,2.363636,18.651515,177.043915,268.38565,19505.399254
1,13.575758,0,408.06383,0.029605,201664.611801,896.0,14.0,,50.984848,433341.195652,...,0.0,1480428.730263,65347.429787,12.919149,,2.363636,18.651515,177.043915,268.38565,19505.399254
2,13.575758,0,408.06383,0.029605,201664.611801,791.0,7.0,,50.984848,433341.195652,...,14.0,1480428.730263,65347.429787,12.919149,,2.363636,18.651515,177.043915,268.38565,19505.399254
3,13.575758,0,408.06383,0.029605,201664.611801,2296.0,7.0,,50.984848,433341.195652,...,0.0,1480428.730263,65347.429787,12.919149,,2.363636,18.651515,177.043915,268.38565,19505.399254
4,8.0,0,408.06383,0.029605,201664.611801,1352.480799,6.87889,,28.0,433341.195652,...,0.614744,1480428.730263,65347.429787,12.919149,,0.0,2.0,177.043915,268.38565,19505.399254


In [70]:
test_cats = test.ix[:, 190:230]
names_to_drop = []
for name in test_cats.columns:
    col = test_cats[name].dropna()
    if len(col.unique()) < 2:
        names_to_drop.append(name)

test_cats[names_to_drop] = test_cats[names_to_drop].fillna(0)

test_cats.fillna(method='bfill', inplace=True)
test_cats.head()

Unnamed: 0_level_0,Var191,Var192,Var193,Var194,Var195,Var196,Var197,Var198,Var199,Var200,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,P1WvyxLp3Z,2Knk1KF,SEuy,taul,1K8T,0Xwj,PHNvXy8,xUOdRmdN20,IBm9AvG,...,zCkv,APgdzOv,jySVZNlOJy,0,ELof,xb3V,6fzt,Zy3gnGM,mj86,0
1,0,bZkvyxLkBI,RO12,SEuy,taul,1K8T,0Xwj,6KF0k8W,V4E_TU9097,JnrRQD4,...,oslk,IIvC99a,LM8l689qOp,0,kG3k,xb3V,RAYp,F2FyR07IdsN7I,mj86,0
2,0,75lTmBtFkL,RO12,SEuy,taul,1K8T,AnrR,ckoNVBU,_jTP8ioIlJ,JnrRQD4,...,oslk,6YSocsg,LM8l689qOp,0,kG3k,rgKb,RAYp,F2FyR07IdsN7I,mj86,0
3,0,YddTmBtueT,RO12,SEuy,taul,1K8T,487l,77f44U8,II0S8f9,a1lFLoc,...,oslk,5nQ7A2G,jySVZNlOJy,0,kG3k,rgKb,RAYp,F2FyR07IdsN7I,am7c,0
4,0,mCGq9ayE15,RO12,SEuy,taul,1K8T,lK27,esxkA1P,767sa0XN9l,pbHkKHa,...,oslk,MI8s5nE,LM8l689qOp,0,xG3x,7P5s,RAYp,F2FyR07IdsN7I,mj86,0


In [71]:
#преобразуем все признаки в числовые
cat_copy = test_cats.copy()
for name in cat_copy.columns:
    counts = {}
    pairs = zip(cat_copy[name].values, y)
    for x in pairs:
        if counts.get(x[0]) == None:
                counts[x[0]] = 0
        if x[1] == 1:
            counts[x[0]] += 1
    cat_copy[name] = cat_copy[name].apply(lambda x: counts[x])

cat_copy.head()

Unnamed: 0_level_0,Var191,Var192,Var193,Var194,Var195,Var196,Var197,Var198,Var199,Var200,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,762,4,112,760,739,769,74,18,0,0,...,108,18,119,753,336,35,61,15,334,771
1,762,1,531,760,739,769,74,3,2,0,...,558,3,608,753,374,35,529,486,334,771
2,762,4,531,760,739,769,6,1,14,0,...,558,1,608,753,374,25,529,486,334,771
3,762,3,531,760,739,769,63,1,5,1,...,558,1,119,753,374,25,529,486,434,771
4,762,0,531,760,739,769,63,1,0,0,...,558,1,608,753,61,34,529,486,334,771


In [72]:
test_obj = pd.concat([test_nums, cat_copy], axis=1)


In [None]:
clf = RandomForestClassifier( class_weight={1: 5}, criterion='entropy', max_features=0.3, min_samples_leaf=20, n_estimators=200, n_jobs=4, random_state=2)
clf.fit(X, y)

In [52]:
len(sfm.get_support())

207

In [54]:
indexes = nums.ix[:, sfm.get_support()]
indexes.head()

Unnamed: 0_level_0,Var1,Var4,Var7,Var9,Var11,Var12,Var14,Var16,Var17,Var18,...,Var214,Var216,Var217,Var220,Var221,Var222,Var225,Var226,Var227,Var229
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,11.003509,0.148235,6.792126,47.487719,8.549254,16.396552,0.694527,119.681478,10.568627,6.517647,...,0,11,0,0,101,0,1333,154,117,1355
1,11.003509,0.148235,7.0,47.487719,8.549254,16.396552,0.694527,119.681478,10.568627,6.517647,...,0,4,0,0,2407,0,1333,132,2337,1355
2,11.003509,0.148235,7.0,47.487719,8.549254,16.396552,0.694527,119.681478,10.568627,6.517647,...,1,0,2,138,277,138,1333,607,211,1355
3,11.003509,0.148235,7.0,47.487719,8.549254,16.396552,0.694527,119.681478,10.568627,6.517647,...,1,243,1,4,2407,4,1333,132,2337,1614
4,11.003509,0.148235,7.0,47.487719,8.549254,16.396552,0.694527,119.681478,10.568627,6.517647,...,1,39,0,0,2407,0,1333,248,2337,1614


In [63]:
to_predict = test_obj[indexes.columns]
result = clf.predict_proba(to_predict)
result[:5]

array([[ 1.        ,  0.        ],
       [ 1.        ,  0.        ],
       [ 1.        ,  0.        ],
       [ 0.16559715,  0.83440285],
       [ 1.        ,  0.        ]])

In [64]:
def write_result(result, filename):
    churn_probs = [x[1] for x in result]
    ids = range(len(churn_probs))
    df  = pd.DataFrame(index=ids, data=churn_probs, columns=['result'])
    df.to_csv(filename)

In [65]:
write_result(result, "result6.csv")

In [73]:
clf = RandomForestClassifier( class_weight={1: 5}, criterion='entropy', max_features=0.3, min_samples_leaf=20, n_estimators=200, n_jobs=4, random_state=2)
clf.fit(nums, y)

RandomForestClassifier(bootstrap=True, class_weight={1: 5},
            criterion='entropy', max_depth=None, max_features=0.3,
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=4,
            oob_score=False, random_state=2, verbose=0, warm_start=False)

In [74]:
to_predict = test_obj[nums.columns]
result = clf.predict_proba(to_predict)
result[:5]

array([[ 1.        ,  0.        ],
       [ 1.        ,  0.        ],
       [ 1.        ,  0.        ],
       [ 0.18781143,  0.81218857],
       [ 1.        ,  0.        ]])

In [75]:
write_result(result, "result7.csv")