# BigDataSchool competition
by Sarana Maksym

## Reading the data

In [1]:
import pandas as pd
import numpy as np

In [5]:
# Load the training and test feature tables from the local DATA directory.
raw_training_data = pd.read_csv('DATA/Xtrain.csv')
raw_test_data = pd.read_csv('DATA/Xtest.csv')
# Column dictionary shipped as an Excel sheet; loaded for reference only.
raw_columns_description = pd.read_excel('DATA/columns_description.xlsx')

# Last expression of the cell: display (rows, columns) of the training frame.
raw_training_data.shape

(8532, 42)

Посмотрим на типы данных по столбцам:

In [3]:
# Put the per-column dtypes of the train and test frames side by side
# (one row per column, indexed by the training frame's column names).
dtype_rows = [raw_training_data.dtypes, raw_test_data.dtypes]
dtype_table = pd.DataFrame(dtype_rows, index=['Train', 'Test'],
                           columns=raw_training_data.columns)
dtype_table.T

Unnamed: 0,Train,Test
SUBS_ID,object,object
AVG_CHARGE_3M_GROUP,int64,int64
V_OUT_INT_DUR_3M,float64,float64
V_IN_INT_DUR_3M,float64,float64
V_OUT_INT_CNT_3M,float64,float64
V_IN_INT_CNT_3M,float64,float64
V_OUT_DUR_3M,float64,float64
V_IN_DUR_3M,float64,float64
V_OUT_CNT_3M,float64,float64
V_IN_CNT_3M,float64,float64


Кроме ключа только два поля нечисловые - HANDSET_VENDOR_CVAL и HANDSET_TYPE. Посмотрим, какие значения они принимают:

In [4]:
# Map each categorical column to every distinct value it takes in train OR test,
# so both frames later get an identical set of dummy columns.
# pd.concat is used instead of Series.append, which was removed in pandas 2.0.
cathegorical = {
    a: pd.concat([raw_training_data[a], raw_test_data[a]]).unique()
    for a in ['HANDSET_VENDOR_CVAL', 'HANDSET_TYPE']
}
cathegorical

{'HANDSET_TYPE': array(['26c96577', '14cec58a', 'b698bfae', '76f9033c', '7e04d707',
        'cdc6dcbd', '3ec4d8ee'], dtype=object),
 'HANDSET_VENDOR_CVAL': array(['13f85d7f', '1e12951', '3ff121', '4f263988', '9d739b49', 'ec450b08',
        'ec10fb23', 'cfe6ac95', '647c1b8c', 'e3507dbf', '207c39a',
        'c28ff45a', 'f3bbf1aa', '4c7c5da3', '8fd12e90', '371db360',
        'efe57266', 'b698bfae', '1ad12908', '8464191f', '6bf8b75c',
        '334e015c', 'baf02cc4', '5a082af2', 'ccac2410', '58951489',
        '315b32bb', '4961ced5', 'c66a35f4', '498a95f4', '7dfafc69',
        'f64d7f38', '54237711', '8ce2d777', 'f584ffc9', '62e9acb5',
        '8f952bca', 'b1ce6e6a', '86ea82ab', 'de9dcac1', '8ad74ffa',
        '4bc753c7', '994471d3', '7fccba87', '7079d5a7', '8a677fe0',
        'c74e833c', '1e907434', 'bea560e3', '98605eec', 'b0d4070f',
        'fe9e5a9f', '7d629f7c', 'bd71c715', '5adde92d', '575e71f5',
        'fff5c034', 'c6c85eb6', 'ddef815d', 'e5759c9e', '5414be82',
        'eb760cc6', '

Теперь посмотрим на пустые значения

In [5]:
# Count missing values per column in each frame and show the two counts side by side.
null_counts = [frame.isnull().sum() for frame in (raw_training_data, raw_test_data)]
pd.DataFrame(null_counts, index=['Train', 'Test'], columns=raw_training_data.columns).T

Unnamed: 0,Train,Test
SUBS_ID,0,0.0
AVG_CHARGE_3M_GROUP,0,0.0
V_OUT_INT_DUR_3M,1,1.0
V_IN_INT_DUR_3M,1,1.0
V_OUT_INT_CNT_3M,1,1.0
V_IN_INT_CNT_3M,1,1.0
V_OUT_DUR_3M,1,1.0
V_IN_DUR_3M,1,1.0
V_OUT_CNT_3M,1,1.0
V_IN_CNT_3M,1,1.0


Можно выделить несколько групп:

* Поле SUPPORT_3G - возможно, пустое поле означает отсутствие поддержки 3G
* Поля, содержащие ночные данные - [DATA_CNT_NGHT_6M, SMS_CNT_NGHT_6M, VOICE_CNT_NGHT_6M, VOICE_DUR_NGHT_6M] - пропуск может означать отсутствие данных, и его можно попробовать заменить на 0
* По остальным - заменим на среднее значение по столбцу

Проверим, что в SUPPORT_3G пустое поле означает отсутствие поддержки 3G:

In [6]:
# Distinct SUPPORT_3G values across train and test.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
pd.concat([raw_training_data['SUPPORT_3G'], raw_test_data['SUPPORT_3G']]).unique()

array([ nan,   1.])

Процедура чистки данных:

In [7]:
def clean_data(data):
    """Impute missing values in ``data`` in place.

    * ``SUPPORT_3G`` and the four ``*_NGHT_6M`` columns: NaN is replaced by 0.
    * Every other numeric column that still has NaN: NaN is replaced by that
      column's mean, computed on ``data`` itself.
    * Non-numeric columns are left untouched, because a mean is undefined for
      them (the previous code raised on such a column).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean; must contain the five zero-filled columns.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    KeyError
        If one of the zero-filled columns is missing from ``data``.
    """
    zero_fill_columns = ['SUPPORT_3G', 'DATA_CNT_NGHT_6M', 'SMS_CNT_NGHT_6M',
                         'VOICE_CNT_NGHT_6M', 'VOICE_DUR_NGHT_6M']
    # Assign the filled column back instead of `data[col].fillna(..., inplace=True)`:
    # the chained in-place form operates on a temporary under pandas
    # Copy-on-Write and silently leaves `data` unchanged.
    for name in zero_fill_columns:
        data[name] = data[name].fillna(0)

    for name in data.columns:
        column = data[name]
        if column.isnull().any() and pd.api.types.is_numeric_dtype(column):
            data[name] = column.fillna(column.mean())

In [8]:
# Impute both raw frames in place (note: this mutates the frames loaded above).
for frame in (raw_training_data, raw_test_data):
    clean_data(frame)

# Re-count missing values per column to confirm the imputation.
null_rows = [raw_training_data.isnull().sum(), raw_test_data.isnull().sum()]
pd.DataFrame(null_rows, index=['Train', 'Test'], columns=raw_training_data.columns).T

Unnamed: 0,Train,Test
SUBS_ID,0,0.0
AVG_CHARGE_3M_GROUP,0,0.0
V_OUT_INT_DUR_3M,0,0.0
V_IN_INT_DUR_3M,0,0.0
V_OUT_INT_CNT_3M,0,0.0
V_IN_INT_CNT_3M,0,0.0
V_OUT_DUR_3M,0,0.0
V_IN_DUR_3M,0,0.0
V_OUT_CNT_3M,0,0.0
V_IN_CNT_3M,0,0.0


Применим Dummy-кодирование к категориальным данным

In [9]:
def cathegorical_dummy(data, cat):
    """Return a copy of ``data`` with categorical columns replaced by 0/1 dummies.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    cat : dict
        Maps a column name to the iterable of category values to encode.
        For each value a column ``<column>_<value>`` is appended holding 1
        where the source column equals that value and 0 elsewhere. The source
        column is dropped afterwards.

    Returns
    -------
    pandas.DataFrame
        The encoded copy.

    Notes
    -----
    Missing category values (NaN/None, which ``Series.unique()`` can return)
    are skipped: they have no usable label and ``column == NaN`` never
    matches, so rows with a missing category simply get 0 in every dummy.
    Previously such a value raised ``TypeError`` while building the name.
    """
    res = data.copy()
    for column, values in cat.items():
        for value in values:
            if pd.isnull(value):
                continue
            # str() keeps the name valid for non-string category values too.
            new_name = column + '_' + str(value)
            res[new_name] = np.where(res[column] == value, 1, 0)
        res.drop(column, axis=1, inplace=True)
    return res

In [10]:
# Dummy-encode both frames with the SAME category dictionary so that train and
# test end up with identical dummy columns.
training_data = cathegorical_dummy(raw_training_data, cathegorical)
test_data = cathegorical_dummy(raw_test_data, cathegorical)

## Логистическая регрессия

In [11]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

In [12]:
# Baseline: logistic regression on every feature except the key and the target.
train_features = training_data.drop(['SUBS_ID', 'IS_RETIREE'], axis=1)
expected = training_data['IS_RETIREE']

model = LogisticRegression().fit(train_features, expected)
print(model)

# Class probabilities on the training set itself (an in-sample fit, not a validation score).
predicted = model.predict_proba(train_features)
# classification_report / confusion_matrix are not called here: `predicted`
# holds probabilities, while those metrics expect class labels.

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [13]:
# Class probabilities for the test set; only the key column is dropped here.
predicted_on_test = model.predict_proba(test_data.drop('SUBS_ID', axis=1))

In [14]:
# Submission frame: subscriber id plus the second predict_proba column.
# The two arrays are passed as rows and transposed into two columns.
res_to_kaggle = pd.DataFrame([test_data['SUBS_ID'].values, predicted_on_test[:,1]], index=['SUBS_ID', 'IS_RETIREE']).T

In [15]:
# Write the submission file; quoting=2 is csv.QUOTE_NONNUMERIC (non-numeric fields are quoted).
res_to_kaggle.to_csv("res_to_kaggle.csv", index = False, quoting=2)  # Kaggle in class: 0.72488

### То же самое, но с предварительным PCA (пробовали 5, 50 и 180 компонент)

In [16]:
import numpy as np
from sklearn.decomposition import PCA

# Fit a 180-component PCA on the training features (key and target excluded).
pca = PCA(n_components=180)
pca.fit(training_data.drop(['SUBS_ID', 'IS_RETIREE'], axis=1))

PCA(copy=True, n_components=180, whiten=False)

In [17]:
# Project both frames with the PCA fitted on the training features.
pca_train = pca.transform(training_data.drop(['SUBS_ID', 'IS_RETIREE'], axis=1))
pca_test = pca.transform(test_data.drop(['SUBS_ID'], axis=1))

In [18]:
# Same logistic regression, now trained on the PCA projection.
model = LogisticRegression().fit(pca_train, training_data['IS_RETIREE'])
print(model)

# Class probabilities for the projected test set.
predicted_on_test = model.predict_proba(pca_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [19]:
# Submission frame: subscriber id plus the second predict_proba column of the PCA model.
res_to_kaggle = pd.DataFrame([test_data['SUBS_ID'].values, predicted_on_test[:,1]], index=['SUBS_ID', 'IS_RETIREE']).T

In [20]:
# Overwrites the previous submission file; quoting=2 is csv.QUOTE_NONNUMERIC.
res_to_kaggle.to_csv("res_to_kaggle.csv", index = False, quoting=2)
# Kaggle in class, by number of PCA components: 5 -> 0.62639; 50 -> 0.72521; 180 -> 0.72509

### Добавим SVM

In [21]:
from sklearn import metrics
from sklearn.svm import SVC
# Fit an SVM with default settings on the PCA-projected training data;
# probability=True enables predict_proba.
# NOTE(review): the probabilities displayed further down are almost constant
# (~0.11 in the second column) -- presumably the default kernel on unscaled
# inputs; confirm before relying on this model.
model = SVC(probability = True)
model.fit(pca_train, training_data['IS_RETIREE'])
print(model)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [22]:
# Class probabilities of the SVM for the PCA-projected test set.
predicted_on_test = model.predict_proba(pca_test)

In [23]:
# Display the probability matrix to eyeball the spread of the SVM predictions.
predicted_on_test

array([[ 0.89176711,  0.10823289],
       [ 0.89091179,  0.10908821],
       [ 0.89094788,  0.10905212],
       ..., 
       [ 0.8912263 ,  0.1087737 ],
       [ 0.89080826,  0.10919174],
       [ 0.89221096,  0.10778904]])

In [24]:
# Submission frame: subscriber id plus the second predict_proba column of the SVM.
res_to_kaggle = pd.DataFrame([test_data['SUBS_ID'].values, predicted_on_test[:,1]], index=['SUBS_ID', 'IS_RETIREE']).T

In [25]:
# Overwrites the previous submission file; quoting=2 is csv.QUOTE_NONNUMERIC.
res_to_kaggle.to_csv("res_to_kaggle.csv", index = False, quoting=2)  # Kaggle in class: 0.62907

## Поиграемся с кросс-валидацией

In [26]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
# Fall back to it only for installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# 80/20 hold-out split of the encoded training data with a fixed seed.
train, test = train_test_split(training_data, train_size=0.8, random_state=12345)


In [27]:
# Share of positive IS_RETIREE rows in each part of the split.
[part['IS_RETIREE'].sum() / float(part.shape[0]) for part in (train, test)]

[0.10578754578754579, 0.11716461628588166]

In [28]:
from sklearn.metrics import roc_auc_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestRegressor

def test_method(atrain, atest, param, atype):
    if atype == 0:
        model = SVC(C = param, probability = True) #LogisticRegression() #SVC(probability = True)
        model.fit(atrain.drop(['SUBS_ID', 'IS_RETIREE'], axis=1), atrain['IS_RETIREE'])
    elif atype == 1:
        model = RandomForestRegressor(n_estimators = param)
        model.fit(atrain.drop(['SUBS_ID', 'IS_RETIREE'], axis=1), atrain['IS_RETIREE'])
    elif atype == 2:
        pca = PCA(n_components=180)
        pca.fit(atrain.drop(['SUBS_ID', 'IS_RETIREE'], axis=1))
        atrain_pca = pca.transform(atrain.drop(['SUBS_ID', 'IS_RETIREE'], axis=1))
        model = RandomForestRegressor(n_estimators = param)
        model.fit(atrain_pca, atrain['IS_RETIREE'])
    
    
    if atype == 0:
        predicted = model.predict_proba(atest.drop(['SUBS_ID', 'IS_RETIREE'], axis=1))
        return [param, roc_auc_score(atest['IS_RETIREE'], predicted[:,np.dot([0, 1], model.classes_)]), model.classes_]
    elif atype == 1:
        predicted = model.predict(atest.drop(['SUBS_ID', 'IS_RETIREE'], axis=1))
        predicted = [max(0, min(1, x)) for x in predicted]
        return [param, roc_auc_score(atest['IS_RETIREE'], predicted)]
    elif atype == 2:
        predicted = model.predict(pca.transform(atest.drop(['SUBS_ID', 'IS_RETIREE'], axis=1)))
        predicted = [max(0, min(1, x)) for x in predicted]
        return [param, roc_auc_score(atest['IS_RETIREE'], predicted)]    

In [29]:
# 0.6208891838088918

In [30]:
#gr = np.arange(-5.0, 5.0, 0.5)

#for g in gr:
#    print(test_method(train, test, 10.0 ** g))

In [31]:
#test_method(train, test, 100.0) #[100.0, 0.36365461181154612, array([0, 1], dtype=int64)]

In [32]:
#test_method(train, test, 100.0) #[100.0, 0.3647113470471135, array([0, 1], dtype=int64)]

In [33]:
#test_method(train, test, 100.0) #[100.0, 0.36470305242203049, array([0, 1], dtype=int64)]

In [34]:
#test_method(train, test, 100.0) #[100.0, 0.36470968812209686, array([0, 1], dtype=int64)]

In [35]:
#test_method(train, test, 100.0) #[100.0, 0.3647279362972794, array([0, 1], dtype=int64)]

In [36]:
#test_method(train, test, 1.0) #[1.0, 0.3791108161911082, array([0, 1], dtype=int64)]

In [37]:
#test_method(train, test, 0.0001) #[0.0001, 0.57934804246848048, array([0, 1], dtype=int64)]

In [38]:
#test_method(train, test, 31622.776601683792) #[31622.776601683792, 0.6804727936297279, array([0, 1], dtype=int64)]

In [39]:
#test_method(train, test, 100.1) #[100.1, 0.33307730590577306, array([0, 1], dtype=int64)]

In [40]:
#test_method(train, test, 101) #[101, 0.66063536828135372, array([0, 1], dtype=int64)]

In [41]:
#gr = np.arange(100, 101, 0.1)

#for g in gr:
#    print(test_method(train, test,  g))

In [42]:
#test_method(train, test, 500, 1) #[500, 0.70621101526211028]

In [43]:
#test_method(train, test, 500, 2) #[500, 0.69130225613802254]

Запишем в файл

In [79]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import BernoulliRBM
from sklearn.svm import SVR
from sknn.mlp import Regressor, Layer

def use_method(atrain, atest, param, atype, out_path="res_to_kaggle.csv"):
    """Train one model on ``atrain``, predict ``atest`` and write a submission CSV.

    Parameters
    ----------
    atrain : pandas.DataFrame
        Training frame with ``SUBS_ID``, the ``IS_RETIREE`` target and features.
    atest : pandas.DataFrame
        Test frame with ``SUBS_ID`` and the same feature columns (no target).
    param : number
        ``C`` for atype 0 and 3, ``n_estimators`` for atype 1 and 2;
        ignored for atype 4, 5 and 6.
    atype : int
        0 - SVC with probability estimates;
        1 - RandomForestRegressor;
        2 - PCA(180) + RandomForestRegressor;
        3 - SVR;
        4 - MultinomialNB;
        5 - sknn Regressor (Linear(5) -> Sigmoid(1));
        6 - PCA(60) + the same sknn Regressor.
    out_path : str, default "res_to_kaggle.csv"
        Where the submission is written (an existing file is overwritten).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``atype`` is not one of 0..6 (previously this surfaced as an
        UnboundLocalError only after the function had run to the final write).
    """
    if atype not in (0, 1, 2, 3, 4, 5, 6):
        raise ValueError("atype must be an integer from 0 to 6, got %r" % (atype,))

    train_x = atrain.drop(['SUBS_ID', 'IS_RETIREE'], axis=1)
    train_y = atrain['IS_RETIREE']
    test_x = atest.drop(['SUBS_ID'], axis=1)

    if atype in (2, 6):
        # Fit the projection on the training features, then apply it to both sets.
        pca = PCA(n_components=180 if atype == 2 else 60)
        pca.fit(train_x)
        train_x = pca.transform(train_x)
        test_x = pca.transform(test_x)
    elif atype == 5:
        # The network is fed plain arrays; .values replaces
        # DataFrame.as_matrix(), which was removed in pandas 1.0.
        train_x = train_x.values
        test_x = test_x.values

    if atype == 0:
        model = SVC(C=param, probability=True)
    elif atype in (1, 2):
        model = RandomForestRegressor(n_estimators=param)
    elif atype == 3:
        model = SVR(C=param)
    elif atype == 4:
        model = MultinomialNB()
    else:
        model = Regressor(
            layers=[
                Layer("Linear", units=5),
                Layer("Sigmoid", units=1)
            ],
            learning_rate=0.02,
            n_iter=1000)
    model.fit(train_x, train_y)

    if atype in (0, 4):
        # predict_proba columns follow model.classes_ (sorted): column 1 is the
        # probability of the larger label. The old np.dot([0, 1], classes_)
        # used a label VALUE as a column index.
        predicted = model.predict_proba(test_x)[:, 1]
    else:
        predicted = model.predict(test_x)
        if atype in (5, 6):
            # The network's predict() output is 2-D; take its single column.
            predicted = predicted[:, 0]
        # Clamp regression outputs into [0, 1]; the list form is kept so the
        # written values match the original output exactly.
        predicted = [max(0, min(1, x)) for x in predicted]

    res_to_kaggle = pd.DataFrame([atest['SUBS_ID'].values, predicted],
                                 index=['SUBS_ID', 'IS_RETIREE']).T
    # quoting=2 is csv.QUOTE_NONNUMERIC.
    res_to_kaggle.to_csv(out_path, index=False, quoting=2)

In [45]:
#use_method(training_data, test_data, 500, 2) #0.70819

## Попробуем поиграться с логистической регрессией

In [48]:
#from sklearn import grid_search

In [49]:

#parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
#model = SVC(probability = True)
#clf = grid_search.GridSearchCV(model, parameters)

#clf.fit(train.drop(['SUBS_ID', 'IS_RETIREE'], axis=1), train['IS_RETIREE'])
#print(clf)

#predicted = model.predict_proba(atest.drop(['SUBS_ID', 'IS_RETIREE'], axis=1))


In [50]:
#use_method(training_data, test_data, 300, 1) #0.74142

In [51]:
#use_method(training_data, test_data, 1, 3) #0.55057

In [52]:
#use_method(training_data, test_data, 300, 3) #0.66357

In [53]:
#use_method(training_data, test_data, 0, 4) #0.67097

## Пробуем нейросети

In [77]:
# atype 5: sknn network on the raw features; the `param` argument (0) is not used by this branch.
use_method(training_data, test_data, 0, 5) 
# Scores recorded for two network configurations (presumably Kaggle in-class scores):
# 0.69952 -
# model = Regressor( layers=[ Layer("Linear", units=5), Layer("Sigmoid", units=3),
# Layer("Sigmoid", units=1) ],learning_rate=0.02, n_iter=10)
# -----

# 0.71321 -
# model = Regressor( layers=[ Layer("Linear", units=5), Layer("Sigmoid", units=1) ], learning_rate=0.02, n_iter=1000)

In [80]:
# atype 6: PCA(60) followed by the sknn network; the `param` argument (0) is not used by this branch.
use_method(training_data, test_data, 0, 6) 

In [None]:
# 0.71836
# PCA(n_components=60) + NN model = Regressor( layers=[ Layer("Linear", units=5),
# Layer("Sigmoid", units=1) ], learning_rate=0.02, n_iter=1000)