In [36]:
import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss


In [7]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,5)

## One-hot-end encoding for features

In [9]:
class CategoricalFeatureEncoder():
    
    def __init__(self, cat_features):
        
        self.cat_features = cat_features
        self.features = {}
        self.features_num = {}
     
   
    def fit_transform(self, X_fit):
        self.fit(X_fit)
        return self.transform(X_fit)
    
    def fit(self, X_fit):
        
        for feature_name in self.cat_features:
            col = X_fit.loc[:,feature_name]
            self.fit_column(col)
        
                
    def transform(self, X_transform):
        for feature_name in self.cat_features:
        
            col = X_transform.loc[:,feature_name] 
            OHC_column = self.transform_column(col)

            #create new names
            col_names = []
            for i in xrange(self.features_num[feature_name]):
                col_names.append(feature_name + '__' + str(i))

                
                
            OCH_pd = pd.DataFrame(OHC_column, columns=col_names)
            X_transform = pd.concat([X_transform, OCH_pd], axis=1).drop(feature_name, axis=1)
            
            
        return X_transform
        
        
    def fit_column(self, col):
        
        col_name = col.name
        col = col.dropna()
        col = np.array(col)
        
        self.features[col_name] = {k: v for v, k in enumerate(list(np.unique(col)))}    
        self.features_num[col_name] = len(self.features[col_name])
    
    def transform_column(self, col):
        
        col_name = col.name
        col = np.array(col)
        OHC_column = np.zeros((col.shape[0], self.features_num[col_name]))

        for i in xrange(col.shape[0]):
            if col[i] in self.features[col_name]:
                
                OHC_column[i, self.features[col_name][col[i]]] = 1
        
        return OHC_column
    

## v22 feature

In [10]:
class cat_feature:

    def __init__(self, n_folds=3, alpha=1):
         
        self.n_folds = n_folds
        self.alpha = alpha       

        self.features_dict = {}

    def fit_fold(self, col):
        
        for i in xrange(col.shape[0]):
            pass
        
    def fit_transform(self, data, target, feature):
        
        self.current_feature = feature
        self.globmean = target.mean()
        
        #create new feature
        new_feature = feature + '__m'
        
        data[new_feature] = 0
        kf = KFold(n_splits=self.n_folds)
        
        new_feature_column = np.zeros(data.shape[0])
        
        for ind1, ind2 in kf.split(data):
            
            # find smoothed mean for every feature value in ind1
            
            # create value - targets dict in current fold
            
            fold_dict = {}
            for i in ind1:
               
                if data[feature][i] in fold_dict:
                    fold_dict[data[feature][i]].append(target[i])
                    
                else:
                    fold_dict[data[feature][i]] = [target[i]]
        
            # convert fold dict
            for i in fold_dict:
                
                K = len(fold_dict[i])
                meanY = np.mean(fold_dict[i])
                fold_dict[i] =  (meanY * K + self.globmean * self.alpha) / (K + self.alpha)
                
            fold_dict[np.nan] = target.mean()
            
            # create new feature for every feature value in ind2        
            for i in ind2:
            
                if data[feature][i] in fold_dict:
                    new_feature_column[i] = fold_dict[data[feature][i]]
                
                else:
                    new_feature_column[i] = fold_dict[np.nan]
                    
        # add new feature
        data[new_feature] = new_feature_column
        
        #create dict from all data to use in transform()
        
        feature_dict = {}
        for i in xrange(data.shape[0]):
            
            if data[feature][i] in feature_dict:
                feature_dict[data[feature][i]].append(target[i])
                    
            else:
                feature_dict[data[feature][i]] = [target[i]]
                
        
        for i in feature_dict:
                
                K = len(feature_dict[i])
                meanY = np.mean(feature_dict[i])
                feature_dict[i] =  (meanY * K + self.globmean * self.alpha) / (K + self.alpha)
                
                
        feature_dict[np.nan] = target.mean()
        self.features_dict[feature] = feature_dict
        
    def transform(self, data, feature):
        
        new_feature = feature + '__m'
        new_feature_column = np.zeros(data.shape[0])
        data[new_feature] = 0
        
        for i in xrange(data.shape[0]):
            
            if data[feature][i] in self.features_dict[feature]:
                new_feature_column[i] = self.features_dict[feature][data[feature][i]]
                
            else:
                new_feature_column[i] = self.features_dict[feature][np.nan]
                
        data[new_feature] = new_feature_column

## Load and prepare data

In [119]:
data_train = pd.read_csv('data/train.csv')
data_test = pd.read_csv('data/test.csv')

y_data_train = data_train.loc[:,'target'].values

#train
data_train = data_train.drop(['target', 'ID'], axis=1)
data_train = data_train.fillna(data_train.median())
#data_train = data_train.fillna('NaaN')

#test 
subm_id = data_test.ID
data_test = data_test.drop(['ID'], axis=1)
data_test = data_test.fillna(data_test.median())
#data_test = data_test.fillna('NaaN')


## Prepare data

In [120]:
cat_features = ['v3', 'v24', 'v30', 'v31', 'v38', 'v47', 'v52', 'v56', 'v62', 'v66', 
                'v71', 'v74', 'v75', 'v79', 'v91', 'v107', 'v110', 'v112', 'v113', 'v125', 'v129']

fe = CategoricalFeatureEncoder(cat_features)

### Векторизация категориальных признаков

In [121]:
X_train_, X_test_, y_train, y_test = train_test_split(data_train, y_data_train, test_size=0.1, random_state=42)


X_train_ = X_train_.reset_index(drop=True)
X_test_ = X_test_.reset_index(drop=True)

X_train_ = fe.fit_transform(X_train_)
X_test_ = fe.transform(X_test_)

### Новый признак из v22

In [122]:
%%time

CF = cat_feature(n_folds=4, alpha=10)
CF.fit_transform(X_train_, y_train, 'v22')
CF.transform(X_test_, 'v22')

CPU times: user 15.2 s, sys: 40.2 ms, total: 15.2 s
Wall time: 15.2 s


In [136]:
X_train = X_train_.drop(['v22'], axis=1)
X_test = X_test_.drop(['v22'], axis=1)

In [146]:
X_train = X_train_.drop(['v22__m', 'v22'], axis=1)
X_test = X_test_.drop(['v22__m', 'v22'], axis=1)

## Predict

## XGB

In [None]:
%%time
clf_xgb = xgb.XGBClassifier(n_estimators=1000, n_jobs=8, silent=False, max_depth=5, random_state=1, learning_rate=0.01)

clf_xgb.fit(X_train, y_train)
print 'fit done'

p_xgb_pred = clf_xgb.predict_proba(X_test)[:,1]
y_xgb_pred = clf_xgb.predict(X_test)

In [None]:
print 'acc = ', accuracy_score(y_test, y_xgb_pred)
print 'log_loss = ', log_loss(y_test, p_xgb_pred)

0.45818499749 с 22
0.459473795842 без

## Logistic regression

In [38]:
%%time

clf_log = LogisticRegression(C=1)

clf_log.fit(X_train, y_train)
print 'fit done'
p_log_pred = clf_log.predict_proba(X_test)[:,1]

y_log_pred = clf_log.predict(X_test)

 fit done
CPU times: user 1min 23s, sys: 468 ms, total: 1min 24s
Wall time: 1min 24s


In [55]:
print 'acc = ', accuracy_score(y_test, y_log_pred)
print 'log_loss = ', log_loss(y_test, p_log_pred)

acc =  0.778360885157
log_loss =  0.473968284769


## Predict

In [63]:
X_subm = fe.transform(data_test)
CF.transform(X_subm, 'v22')
X_subm = X_subm.drop('v22', axis=1)

In [64]:
p_subm = clf_xgb.predict_proba(X_subm)[:,1]

In [65]:
SS = pd.read_csv('data/sample_submission.csv')
SS.head()

Unnamed: 0,ID,PredictedProb
0,0,0.5
1,1,0.5
2,2,0.5
3,7,0.5
4,10,0.5


In [66]:
p_subm

array([ 0.30995306,  0.90285206,  0.87846667, ...,  0.86420178,
        0.88268155,  0.49221018], dtype=float32)

In [67]:
SUBM = pd.DataFrame(columns=['ID', 'PredictedProb'])

In [68]:
SUBM.ID = subm_id
SUBM.PredictedProb = p_subm

In [69]:
SUBM.head()

Unnamed: 0,ID,PredictedProb
0,0,0.309953
1,1,0.902852
2,2,0.878467
3,7,0.602573
4,10,0.748438


In [71]:
SUBM.to_csv('data/subm.csv', index=False)