In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [2]:
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'ggplot' style sheet, then set the default figure size to (12, 5).
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,5)

## One-hot encoding for features

In [3]:
def log_loss_score(y_true, p_pred):
    """Binary log loss (cross-entropy) averaged over all samples.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, 0 or 1.
    p_pred : array-like
        Predicted probabilities of the positive class.

    Returns
    -------
    float
        ``-mean(y * log(p) + (1 - y) * log(1 - p))`` where ``p`` is
        ``p_pred`` clipped to ``[1e-15, 1 - 1e-15]``.
    """
    eps = 1e-15

    # Convert to float64 before clipping: in float32 the upper bound
    # 1 - 1e-15 rounds to exactly 1.0, which would make log(1 - p) = -inf.
    y = np.asarray(y_true, dtype=np.float64)
    # np.clip returns a new array, so the caller's predictions are not modified.
    p = np.clip(np.asarray(p_pred, dtype=np.float64), eps, 1 - eps)

    return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))

In [4]:
class CategoricalFeatureEncoder(object):
    """One-hot encoder for a fixed list of categorical DataFrame columns.

    ``fit`` learns, per column, the sorted set of distinct non-null values.
    ``transform`` replaces each categorical column with one indicator column
    per learned value, named ``<column>__<position>``. Values that were not
    seen during ``fit`` (including NaN) produce an all-zero row.
    """

    def __init__(self, cat_features):
        """Store the names of the columns to encode.

        Parameters
        ----------
        cat_features : list of str
            Column names that ``fit``/``transform`` will process.
        """
        self.cat_features = cat_features
        # column name -> {category value: position inside the one-hot block}
        self.features = {}
        # column name -> number of categories learned for that column
        self.features_num = {}

    def fit_transform(self, X_fit):
        """Fit on ``X_fit`` and return its encoded version."""
        self.fit(X_fit)
        return self.transform(X_fit)

    def fit(self, X_fit):
        """Learn the category -> position mapping for every configured column."""
        for feature_name in self.cat_features:
            self.fit_column(X_fit.loc[:, feature_name])

    def transform(self, X_transform):
        """Return a new frame with each categorical column one-hot encoded.

        The input frame is not modified. The indicator columns are appended
        after the remaining original columns, in ``cat_features`` order.
        """
        for feature_name in self.cat_features:
            onehot = self.transform_column(X_transform.loc[:, feature_name])

            col_names = [feature_name + '__' + str(position)
                         for position in range(self.features_num[feature_name])]

            # Reuse the input's index so the axis=1 concat aligns row by row;
            # a default RangeIndex would misalign (and add NaN rows) whenever
            # the caller's frame is not indexed 0..n-1.
            onehot_df = pd.DataFrame(onehot, columns=col_names,
                                     index=X_transform.index)
            X_transform = pd.concat([X_transform, onehot_df], axis=1).drop(feature_name, axis=1)

        return X_transform

    def fit_column(self, col):
        """Record the distinct non-null values of one column (a named Series)."""
        categories = np.unique(np.array(col.dropna()))

        self.features[col.name] = {value: position
                                   for position, value in enumerate(categories)}
        self.features_num[col.name] = len(self.features[col.name])

    def transform_column(self, col):
        """Return the (n_rows, n_categories) 0/1 matrix for one column."""
        mapping = self.features[col.name]
        values = np.array(col)
        onehot = np.zeros((values.shape[0], self.features_num[col.name]))

        for row, value in enumerate(values):
            position = mapping.get(value)
            # Unknown values leave the whole row at zero.
            if position is not None:
                onehot[row, position] = 1

        return onehot
    

## v22 feature

In [5]:
class v22_feature(object):
    """Smoothed target-mean encoding of the ``v22`` column.

    For every distinct ``v22`` value seen in ``fit`` the encoder stores

        (sum of targets + globmean * alpha) / (count + alpha)

    where ``globmean`` is the mean target of the fitting data. ``transform``
    writes that number into a new ``v22_f`` column; missing or unseen
    ``v22`` values receive ``globmean``.
    """

    def __init__(self, alpha=0):
        """
        Parameters
        ----------
        alpha : number, default 0
            Smoothing weight given to the global mean; 0 yields the plain
            per-value target mean.
        """
        # v22 value -> smoothed target mean; the np.nan key holds the fallback.
        self.v22_dict = {}
        self.globmean = 0
        self.alpha = alpha

    def fit(self, X_fit, target_fit):
        """Learn the encoding from ``X_fit['v22']`` and the matching targets.

        Rows and targets are paired by position, so ``X_fit`` may carry any
        index. Each call rebuilds ``v22_dict`` from scratch.
        """
        keys = np.asarray(X_fit['v22'], dtype=object)
        target = np.asarray(target_fit, dtype=float)

        self.globmean = target.mean()

        # groupby skips NaN keys, so missing v22 values never get their own
        # entry and fall through to the global-mean fallback below.
        stats = (
            pd.DataFrame({'key': keys, 'y': target})
            .groupby('key', sort=False)['y']
            .agg(['sum', 'count'])
        )
        smoothed = (stats['sum'] + self.globmean * self.alpha) / (stats['count'] + self.alpha)

        self.v22_dict = smoothed.to_dict()
        self.v22_dict[np.nan] = self.globmean

    def transform(self, X_transform):
        """Add the ``v22_f`` column to ``X_transform`` in place and return it."""
        fallback = self.v22_dict.get(np.nan, self.globmean)
        keys = np.asarray(X_transform['v22'], dtype=object)

        # Positional assignment: one encoded value per row, whatever the index.
        X_transform['v22_f'] = np.array(
            [self.v22_dict.get(key, fallback) for key in keys], dtype=float
        )

        return X_transform

## Load and prepare data

In [6]:
data_train = pd.read_csv('data/train.csv')
data_test = pd.read_csv('data/test.csv')

# Labels are taken out before the target column is dropped below.
y_data_train = data_train.loc[:,'target'].values

# train: drop label/identifier columns, then fill missing numeric values with
# the column medians. numeric_only=True is explicit because the frame also
# holds string columns, for which a median is undefined; those keep their NaNs.
data_train = data_train.drop(['target', 'ID'], axis=1)
data_train = data_train.fillna(data_train.median(numeric_only=True))
#data_train = data_train.fillna('NaaN')

# test: keep the IDs for the submission file, then impute the same way.
# NOTE(review): the test set is filled with its own medians, not the training
# medians - confirm this is intended.
subm_id = data_test.ID
data_test = data_test.drop(['ID'], axis=1)
data_test = data_test.fillna(data_test.median(numeric_only=True))
#data_test = data_test.fillna('NaaN')

# drop v22 for now
#data_train = data_train.drop(['v22'], axis=1)
# data_test = data_test.drop(['v22'], axis=1)

## Prepare data

In [7]:

# Categorical columns to one-hot encode, written as the numeric suffix of each
# 'v<N>' column name.
cat_features = list(map('v{}'.format, (
    3, 24, 30, 31, 38, 47, 52, 56, 62, 66,
    71, 74, 75, 79, 91, 107, 110, 112, 113, 125, 129,
)))

# Encoder instance shared by the validation and submission pipelines.
fe = CategoricalFeatureEncoder(cat_features=cat_features)

### Векторизация категориальных признаков

In [22]:
# Hold out 10% of the labelled data for validation.
X_train1, X_test_, y_train1, y_test = train_test_split(
    data_train,
    y_data_train,
    test_size=0.1,
    random_state=42,
)

# Re-index each split to 0..n-1, fit the one-hot encoder on the training part
# only, and apply the fitted encoder to the validation part.
X_train1 = fe.fit_transform(X_train1.reset_index(drop=True))
X_test_ = fe.transform(X_test_.reset_index(drop=True))

### Новый признак из v22

In [23]:
# Split the training part in half: X_train_/y_train and X_train22/y_train22.
X_train_, X_train22, y_train, y_train22 = train_test_split(
    X_train1,
    y_train1,
    test_size=0.5,
    random_state=42,
)

# Give both halves a fresh 0..n-1 index.
X_train_ = X_train_.reset_index(drop=True)
X_train22 = X_train22.reset_index(drop=True)

In [24]:
# Fit the v22 target encoder (smoothing weight alpha=10) on the X_train22 half only.
fe22 = v22_feature(alpha=10)
fe22.fit(X_train22, y_train22)

# Add the encoded 'v22_f' column to the other training half and to the validation split.
X_train_ = fe22.transform(X_train_)
X_test_ = fe22.transform(X_test_)

In [None]:
# Remove the raw 'v22' column; its encoded counterpart 'v22_f' stays in the frames.
X_train = X_train_.drop('v22', axis=1)
X_test = X_test_.drop('v22', axis=1)

## Train and evaluate models

## XGB

In [26]:
%%time
clf_xgb = xgb.XGBClassifier(n_estimators=1500, n_jobs=8, silent=False, max_depth=5, random_state=1, learning_rate=0.01)

clf_xgb.fit(X_train, y_train)
print 'fit done'

p_xgb_pred = clf_xgb.predict_proba(X_test)[:,1]
y_xgb_pred = clf_xgb.predict(X_test)

fit done
CPU times: user 36min 51s, sys: 1.23 s, total: 36min 52s
Wall time: 4min 37s


In [32]:
# Validation metrics for the XGBoost model. %-formatting inside print(...)
# emits the same text on Python 2 and Python 3 (label, two spaces, value).
print('acc =  %s' % accuracy_score(y_test, y_xgb_pred))
print('log_loss =  %s' % log_loss_score(y_test, p_xgb_pred))

acc =  0.781946995539
log_loss =  0.460593128677


## Logistic regression

In [14]:
%%time

clf_log = LogisticRegression(C=1)

clf_log.fit(X_train, y_train)
print 'fit done'
p_log_pred = clf_log.predict_proba(X_test)[:,1]

y_log_pred = clf_log.predict(X_test)

fit done
CPU times: user 1min 5s, sys: 443 ms, total: 1min 5s
Wall time: 1min 5s


In [17]:
# Validation metrics for the logistic regression model. %-formatting inside
# print(...) emits the same text on Python 2 and Python 3.
print('acc =  %s' % accuracy_score(y_test, y_log_pred))
print('log_loss =  %s' % log_loss_score(y_test, p_log_pred))

acc =  0.774424910347
log_loss =  0.47511210859


## Predict

In [33]:
# Run the competition test set through the same pipeline as the validation
# split: one-hot encode, add 'v22_f', then drop the raw 'v22' column.
X_subm = fe22.transform(fe.transform(data_test)).drop('v22', axis=1)

In [38]:
# Second column of the XGBoost predict_proba output for every submission row.
p_subm = clf_xgb.predict_proba(X_subm)[:,1]

In [36]:
# Load the submission file currently stored at data/subm.csv and preview it.
# NOTE: the same path is overwritten by the final to_csv call of this notebook.
SS = pd.read_csv('data/subm.csv')
SS.head()

Unnamed: 0,ID,PredictedProb
0,0,0.213102
1,1,0.882937
2,2,0.832178
3,7,0.610653
4,10,0.780381


In [39]:
# Display the predicted probabilities (numpy abbreviates long arrays).
p_subm

array([ 0.32206333,  0.85400695,  0.8185541 , ...,  0.852319  ,
        0.88559425,  0.51814526], dtype=float32)

In [40]:
# Empty frame with the two submission columns; the values are assigned in the next cell.
SUBM = pd.DataFrame(columns=['ID', 'PredictedProb'])

In [43]:
# Fill the existing submission columns: test-set IDs and their predicted probabilities.
SUBM['ID'] = subm_id
SUBM['PredictedProb'] = p_subm

In [45]:
# Preview the first rows of the assembled submission.
SUBM.head()

Unnamed: 0,ID,PredictedProb
0,0,0.322063
1,1,0.854007
2,2,0.818554
3,7,0.628068
4,10,0.744074


In [47]:
# Write only the ID and PredictedProb columns; index=False keeps the row index
# from being saved as an extra unnamed first column.
SUBM.to_csv('data/subm.csv', index=False)