In [1]:
import pandas as pd
import numpy as np
from sklearn import cross_validation, linear_model, metrics, preprocessing
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import feature_selection as fs
from sklearn.svm import LinearSVC



In [2]:
train = pd.read_csv("orange_small_churn_train_data.csv", index_col='ID')
y = train['labels']
test = pd.read_csv("orange_small_churn_test_data.csv", index_col='ID')
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40000 entries, 0 to 39999
Columns: 231 entries, Var1 to labels
dtypes: float64(191), int64(2), object(38)
memory usage: 70.8+ MB


In [3]:
data = pd.concat([train.drop('labels', axis=1), test], ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Columns: 230 entries, Var1 to Var230
dtypes: float64(191), int64(1), object(38)
memory usage: 87.7+ MB


In [4]:
numbers = data.ix[:, 0:190]
names_to_drop = []
for name in numbers.columns:
    col = numbers[name].dropna()
    if len(col.unique()) < 2:
        names_to_drop.append(name)

numbers.drop(names_to_drop, axis=1, inplace=True)

In [5]:
def write_result(result, filename):
    churn_probs = [x[1] for x in result]
    ids = range(len(churn_probs))
    df  = pd.DataFrame(index=ids, data=churn_probs, columns=['result'])
    df.to_csv(filename, index_label = 'Id')

def test_dataset(train, y, test, filename):
    clf = GradientBoostingClassifier(n_estimators=200, random_state=10)
#     clf.fit(train, y)
#     result = clf.predict_proba(test)
#     write_result(result, filename)

In [6]:
try1 = numbers.copy()
try1.fillna(0, inplace=True)
try1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Columns: 173 entries, Var1 to Var190
dtypes: float64(172), int64(1)
memory usage: 66.0 MB


In [7]:
test_dataset(try1[0:40000], y, try1[40000:], "test1.csv")

In [8]:
try2 = numbers.copy()
for i in range(try2.shape[1]):
    col = try2.ix[:, i]
    mean = col.mean()
    col.fillna(mean, inplace=True)
try2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Columns: 173 entries, Var1 to Var190
dtypes: float64(172), int64(1)
memory usage: 66.0 MB


In [9]:
test_dataset(try2[0:40000], y, try2[40000:], "test2.csv")

In [10]:
try1.describe()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var180,Var181,Var182,Var183,Var184,Var186,Var187,Var188,Var189,Var190
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.16128,0.0001,10.5474,0.00396,7101.714,1179.7597,6.05514,0.67596,11676.09,0.21392,...,53025.63,0.5502,44737.43,1930.346,0.21,0.04632,0.23228,4.154086,113.57856,146.566921
std,5.006245,0.022361,675.452356,0.227652,118222.1,2566.782817,6.33667,19.182295,173342.7,1.415471,...,631180.3,2.374479,474774.2,33978.67,7.513715,1.109937,7.391748,31.626171,144.718679,2970.063833
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.42,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,287.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,798.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,1316.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,246.0,0.0
max,680.0,5.0,130668.0,27.0,6048550.0,131761.0,140.0,2300.0,12325590.0,40.0,...,14284830.0,49.0,11994780.0,3048400.0,1200.0,102.0,910.0,628.62,642.0,230427.0


In [11]:
scaled1 = (try1 - try1.min()) / (try1.max() - try1.min())
scaled1.describe()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var180,Var181,Var182,Var183,Var184,Var186,Var187,Var188,Var189,Var190
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.000237,2e-05,8.1e-05,0.000147,0.001174,0.008954,0.043251,0.000294,0.000947,0.005348,...,0.003712,0.011229,0.00373,0.000633,0.000175,0.000454,0.000255,0.016651,0.176914,0.000636
std,0.007362,0.004472,0.005169,0.008432,0.019546,0.019481,0.045262,0.00834,0.014064,0.035387,...,0.044185,0.048459,0.039582,0.011146,0.006261,0.010882,0.008123,0.049802,0.225419,0.012889
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.002178,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01011,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.006056,0.05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01011,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.009988,0.05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01011,0.383178,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
scaled2 = (try2 - try2.min()) / (try2.max() - try2.min())
scaled2.describe()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var180,Var181,Var182,Var183,Var184,Var186,Var187,Var188,Var189,Var190
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.016893,0.000806,0.003255,0.004644,0.039479,0.010067,0.048639,0.020933,0.031853,0.019556,...,0.264389,0.012479,0.118105,0.025513,0.007051,0.032345,0.01818,0.273665,0.415318,0.095505
std,0.007089,0.00447,0.005144,0.008392,0.018359,0.019223,0.04261,0.007968,0.012981,0.014116,...,0.03138,0.048314,0.033766,0.010416,0.006165,0.010195,0.007836,0.028266,0.088399,0.010286
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.016893,0.000806,0.003255,0.004644,0.039479,0.004409,0.0,0.020933,0.031853,0.019556,...,0.264389,0.0,0.118105,0.025513,0.007051,0.032345,0.01818,0.273665,0.415318,0.095505
50%,0.016893,0.000806,0.003255,0.004644,0.039479,0.007172,0.05,0.020933,0.031853,0.019556,...,0.264389,0.0,0.118105,0.025513,0.007051,0.032345,0.01818,0.273665,0.415318,0.095505
75%,0.016893,0.000806,0.003255,0.004644,0.039479,0.010067,0.05,0.020933,0.031853,0.019556,...,0.264389,0.0,0.118105,0.025513,0.007051,0.032345,0.01818,0.273665,0.415318,0.095505
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
test_dataset(scaled1[0:40000], y, scaled1[40000:], "test3.csv")
test_dataset(scaled2[0:40000], y, scaled2[40000:], "test4.csv")

In [14]:
categors = data.ix[:, 190:230]
names_to_drop = []
for name in categors.columns:
    col = categors[name].dropna()
    if len(col.unique()) < 2:
        names_to_drop.append(name)

categors.drop(names_to_drop, axis=1, inplace=True)

categors.fillna(method='bfill', inplace=True)
categors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 34 columns):
Var192    50000 non-null object
Var193    50000 non-null object
Var194    49995 non-null object
Var195    50000 non-null object
Var196    50000 non-null object
Var197    50000 non-null object
Var198    50000 non-null object
Var199    50000 non-null object
Var200    50000 non-null object
Var201    49995 non-null object
Var202    50000 non-null object
Var203    50000 non-null object
Var204    50000 non-null object
Var205    50000 non-null object
Var206    50000 non-null object
Var207    50000 non-null object
Var208    50000 non-null object
Var210    50000 non-null object
Var211    50000 non-null object
Var212    50000 non-null object
Var214    50000 non-null object
Var216    50000 non-null object
Var217    50000 non-null object
Var218    50000 non-null object
Var219    50000 non-null object
Var220    50000 non-null object
Var221    50000 non-null object
Var222    50000 non-null ob

In [15]:
#преобразуем все признаки в числовые
le = preprocessing.LabelEncoder()
cat_as_nums = categors.apply(le.fit_transform)
cat_as_nums.head()

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


Unnamed: 0,Var192,Var193,Var194,Var195,Var196,Var197,Var198,Var199,Var200,Var201,...,Var219,Var220,Var221,Var222,Var223,Var225,Var226,Var227,Var228,Var229
0,161,5,2,22,0,0,2169,1447,3511,2,...,11,3129,0,4067,0,1,14,0,29,1
1,170,0,2,22,0,129,830,4076,3511,2,...,11,3138,4,379,0,1,22,2,2,1
2,115,17,2,22,0,113,2991,4015,10529,2,...,11,350,6,2751,0,1,7,3,25,1
3,333,25,2,22,0,2,4000,1804,10529,2,...,11,4232,4,2864,0,1,22,2,8,0
4,38,25,2,22,0,210,3331,2897,10529,2,...,11,1797,4,1506,0,1,12,2,8,0


In [16]:
nums1 = pd.concat([scaled1, cat_as_nums], axis=1)

cat_scaled = (cat_as_nums - cat_as_nums.min()) / (cat_as_nums.max() - cat_as_nums.min())
nums2 = pd.concat([scaled1, cat_scaled], axis=1)
nums2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Columns: 207 entries, Var1 to Var229
dtypes: float64(207)
memory usage: 79.0 MB


In [17]:
test_dataset(nums1[0:40000], y, nums1[40000:], "test5.csv")
test_dataset(nums2[0:40000], y, nums2[40000:], "test6.csv")

In [18]:
nums2.to_csv("data_prepared.csv", index_label = 'Id')

In [32]:
from imblearn import under_sampling as us

In [33]:
sampler = us.ClusterCentroids(random_state=10)
X_res2, y_res2 = sampler.fit_sample(nums2[0:40000], y)
(len(X_res2), len(y_res2))

(5952, 5952)

In [34]:
test_dataset(X_res2, y_res2, nums2[40000:], "test7.csv")

In [35]:
sfm = fs.SelectFromModel(LinearSVC(penalty="l1", dual=False))
X = sfm.fit_transform(nums2[0:40000], y)
X.shape

(40000, 98)

In [36]:
selected = nums2.ix[:, sfm.get_support()]
selected.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 98 columns):
Var5      50000 non-null float64
Var6      50000 non-null float64
Var7      50000 non-null float64
Var11     50000 non-null float64
Var14     50000 non-null float64
Var16     50000 non-null float64
Var19     50000 non-null float64
Var24     50000 non-null float64
Var27     50000 non-null float64
Var33     50000 non-null float64
Var34     50000 non-null float64
Var35     50000 non-null float64
Var38     50000 non-null float64
Var50     50000 non-null float64
Var51     50000 non-null float64
Var53     50000 non-null float64
Var54     50000 non-null float64
Var57     50000 non-null float64
Var59     50000 non-null float64
Var65     50000 non-null float64
Var72     50000 non-null float64
Var73     50000 non-null float64
Var76     50000 non-null float64
Var77     50000 non-null float64
Var78     50000 non-null float64
Var80     50000 non-null float64
Var81     50000 non-null float64


In [38]:
test_dataset(selected[0:40000], y, selected[40000:], "test8.csv")

In [39]:
clf = GradientBoostingClassifier(n_estimators=300, learning_rate=0.2, max_depth=5, loss='exponential', max_features=0.3, random_state=10)
clf.fit(nums2[0:40000], y)
result = clf.predict_proba(nums2[40000:])
write_result(result, "test9.csv")

In [40]:
clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1, random_state=10)
clf.fit(nums2[0:40000], y)
result = clf.predict_proba(nums2[40000:])
write_result(result, "test10.csv")

In [41]:
clf = RandomForestClassifier(class_weight={1: 5}, criterion='entropy', max_features=0.3, min_samples_leaf=20, n_estimators=200, n_jobs=4, random_state=10)
clf.fit(nums2[0:40000], y)
result = clf.predict_proba(nums2[40000:])
write_result(result, "test11.csv")

In [42]:
clf = RandomForestClassifier(class_weight={1: 40}, n_estimators=1000, n_jobs=4, random_state=10)
clf.fit(nums2[0:40000], y)
result = clf.predict_proba(nums2[40000:])
write_result(result, "test12.csv")

In [43]:
clf = GradientBoostingClassifier(n_estimators=200, random_state=10)
clf.fit(nums2[0:40000], y)
result = clf.predict_proba(nums2[40000:])
write_result(result, "test13.csv")