In [1]:
import os
# Move the working directory up one level; every relative path below is
# resolved against it (presumably the notebook lives in a sub-folder of the
# project root -- confirm).
# NOTE: this is relative to the *current* directory, so re-running the cell
# without restarting the kernel moves up one more level each time.
os.chdir('./../')

In [2]:
import joblib
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer



In [3]:
# Directory (relative to the working directory) that the train/test CSVs are
# read from and the submission CSV is written to.
base_path = './data'

In [4]:
# Load both CSVs, using the first column of each file as the row index.
# `all_df` is the labelled data (it contains the 'Class' column used below);
# `final_test_df` is the data that is scored for the submission.
all_df = pd.read_csv(f'{base_path}/train.csv', index_col=0)
final_test_df = pd.read_csv(f'{base_path}/test.csv', index_col=0)

In [5]:
# Feature matrix: every column of the labelled data except the 'Class' target.
all_train = all_df.drop(columns='Class')
# Target vector. NOTE: despite its name, `all_test` holds the labels for
# `all_train`; it is not a test split.
all_test = all_df['Class']

In [6]:
# Hold out part of the labelled rows for evaluation. No split size is given,
# so scikit-learn's default proportion applies; the fixed seed keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    all_train, all_test, random_state=42
)

In [7]:
def delete_cols_from(df: pd.DataFrame, col_names: list):
    """Remove the named columns from ``df`` in place.

    Names in ``col_names`` that are not columns of ``df`` are silently
    ignored. Nothing is returned: the caller's DataFrame itself is modified.
    """
    # De-duplicate while keeping order, then keep only names that exist.
    present = [name for name in dict.fromkeys(col_names) if name in df]
    if present:
        df.drop(columns=present, inplace=True)

In [8]:
# Performance of baseline model suffers, hence not deleting the correlated columns.
# delete_cols_from(x_train, ['FD', 'GL'])
# delete_cols_from(x_test, ['FD', 'GL'])

In [9]:
# Positional index of the 'EJ' column; the encoder below addresses the column
# by position because it operates on plain NumPy arrays.
index_of_ej = x_train.columns.tolist().index('EJ')

In [10]:
class CategoricalTransformer:
    """Target-encode a two-level ('A' / 'B') categorical column.

    ``fit`` stores the mean of ``y`` over the rows whose value in the chosen
    column is ``'A'`` and over the rows whose value is ``'B'``.
    ``transform`` then replaces ``'A'`` with the first mean and *every other
    value* in that column with the second mean.

    Parameters
    ----------
    index_of_col : int
        Positional index of the categorical column in the feature matrix.
    """

    def __init__(self, index_of_col):
        self._index_of_col = index_of_col
        # Mean target for rows labelled 'A' / 'B'; populated by ``fit``.
        self._a = None
        self._b = None

    @staticmethod
    def _masked_mean(values, mask):
        """Return the mean of ``values[mask]``, or NaN when nothing is selected."""
        selected = values[mask]
        return selected.mean() if selected.size else np.nan

    def fit(self, x, y):
        """Learn the per-category target means.

        Parameters
        ----------
        x : pandas.DataFrame or numpy.ndarray
            Two-dimensional feature matrix containing the categorical column.
        y : array-like
            Target values, one per row of ``x``.

        Returns
        -------
        CategoricalTransformer
            ``self``, so calls can be chained (scikit-learn convention).
        """
        x = x.values if isinstance(x, pd.DataFrame) else np.asarray(x)
        # Convert so that lists and pandas Series are all indexed positionally.
        y = np.asarray(y)
        categories = x[:, self._index_of_col]
        self._a = self._masked_mean(y, categories == 'A')
        self._b = self._masked_mean(y, categories == 'B')
        return self

    def transform(self, x):
        """Return a copy of ``x`` with the categorical column target-encoded.

        The input is never modified. Values equal to ``'A'`` become the mean
        learned for 'A'; all other values (including anything that is not
        ``'B'``) become the mean learned for 'B'.

        Parameters
        ----------
        x : pandas.DataFrame or numpy.ndarray
            Two-dimensional feature matrix.

        Returns
        -------
        numpy.ndarray
            Array of the same shape as ``x``.
        """
        # Always work on a copy: writing into the caller's array would make a
        # second call see already-encoded numbers instead of 'A' / 'B'.
        if isinstance(x, pd.DataFrame):
            x = x.to_numpy(copy=True)
        else:
            x = np.array(x, copy=True)
        categories = x[:, self._index_of_col]
        x[:, self._index_of_col] = np.where(categories == 'A', self._a, self._b)
        return x

    def fit_transform(self, x, y):
        """Fit on ``(x, y)`` and return the encoded copy of ``x``."""
        return self.fit(x, y).transform(x)

In [11]:
# Steps run in the order listed; each one receives the previous step's output.
pipeline = make_pipeline(
    # Replace the 'A'/'B' strings in the EJ column with numbers.
    CategoricalTransformer(index_of_ej),
    # Fill in missing values, starting from each column's median.
    IterativeImputer(initial_strategy='median'),
    # Power-transform the features before the linear model.
    PowerTransformer(),
    # Final classifier, default hyper-parameters.
    LogisticRegression(),
)

In [12]:
# Fit every step on the training split. The features are handed over as a
# plain NumPy array (`.values`) rather than as a DataFrame.
pipeline.fit(x_train.values, y_train)

  loglike = -n_samples / 2 * np.log(x_trans.var())


Pipeline(steps=[('categoricaltransformer',
                 <__main__.CategoricalTransformer object at 0x7f31b87d7640>),
                ('iterativeimputer',
                 IterativeImputer(initial_strategy='median')),
                ('powertransformer', PowerTransformer()),
                ('logisticregression', LogisticRegression())])

In [13]:
# In-sample log loss: computed on the same rows the pipeline was fitted on,
# so expect it to look better than the cross-validated figure.
log_loss(y_train, pipeline.predict_proba(x_train))

0.16008022602499214

In [14]:
# 5-fold cross-validation on the training split. The 'neg_log_loss' scorer
# returns the negated log loss, so values closer to zero are better.
cv_scores = cross_val_score(pipeline, x_train, y_train, cv=5, scoring='neg_log_loss')
cv_scores

array([-0.26179176, -0.24844446, -0.58985217, -0.25304711, -0.42036003])

In [15]:
# Average fold score and its spread across the five folds.
cv_scores.mean(), cv_scores.std()

(-0.3546991070340024, 0.13406212942824208)

In [16]:
# Refit the same pipeline object on all labelled rows (`all_train` holds the
# features, `all_test` the labels). `fit` returns the pipeline itself, so
# `final_model` is a second name for `pipeline`, not an independent copy.
final_model = pipeline.fit(all_train, all_test)

In [17]:
# Predicted class probabilities for each row to be scored, keeping the
# original row index.
# NOTE(review): the column labels assume predict_proba's columns are ordered
# class 0 then class 1 (i.e. pipeline.classes_ == [0, 1]) -- confirm.
submission_df = pd.DataFrame(pipeline.predict_proba(final_test_df), 
                             index=final_test_df.index,
                             columns=['class_0', 'class_1'])

In [18]:
# Display the predictions for a visual sanity check before writing them out.
submission_df

Unnamed: 0_level_0,class_0,class_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
00eed32682bb,1.0,0.0
010ebe33f668,1.0,0.0
02fa521e1838,1.0,0.0
040e15f562a2,1.0,0.0
046e85c7cc7f,1.0,0.0


In [19]:
# Write the submission into the data directory; the row index is written as
# the first CSV column.
submission_df.to_csv(f'{base_path}/submission.csv')