## General Imports

In [66]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_absolute_error, accuracy_score, make_scorer, precision_score, recall_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import Normalizer

from sklearn.decomposition import PCA

# Apply the style sheet first: 'seaborn' ships its own figure.figsize, so a
# size set before plt.style.use() would be silently overwritten by it.
plt.style.use('seaborn')
mpl.rc(group='figure', figsize=(10,8))

# Silence every warning for the whole session to keep cell output short.
# NOTE(review): this also hides scikit-learn convergence/deprecation warnings.
from warnings import filterwarnings
filterwarnings('ignore')

## Our Imports

In [67]:
import dataset_parser
from submission_generator import generate_submission

#split using train_test_split
from functions import splitData
#gridsearchCV
from functions import grid_search_cross_val

## Parse dataset flat

In [68]:
# Load the data through the project's dataset_parser helper: labelled training
# features and targets, plus the unlabelled features predicted for the submission.
# NOTE(review): "flat" presumably means one flat feature vector per sample — confirm in dataset_parser.
x_train, y_train, x_test = dataset_parser.parseFlat()

In [69]:
# Dimensions of the full labelled feature matrix, as (rows, columns).
x_train.shape

(15485, 500)

## Train/test split

In [70]:
# Hold out part of the labelled data for local evaluation.
# Naming: upper-case X_*/Y_* are the local train/hold-out split, while lower-case
# x_train/y_train/x_test stay the full labelled set and the submission inputs.
X_train, X_test, Y_train, Y_test = splitData(x_train, y_train)

In [71]:
# Dimensions of the training part of the split, as (rows, columns).
X_train.shape

(12388, 500)

## Normalizing

In [72]:
# Normalizer rescales each row independently, so fit() learns nothing from the
# data; it is still called to follow the usual fit/transform protocol.
normalizer = Normalizer().fit(X_train)
X_train, X_test = normalizer.transform(X_train), normalizer.transform(X_test)

## PCA

In [73]:
# The projection is fitted on the training split only and then applied to both
# splits, so the hold-out rows never influence it. n_components is left at its
# default, which keeps every component.
pca = PCA().fit(X_train)
X_train, X_test = pca.transform(X_train), pca.transform(X_test)

## Hyperparameter tuning

In [74]:
# Candidate regularisation strengths: powers of ten from 1e-4 to 1e4.
C_range = np.power(10.0, np.arange(-4, 5))

# Search space: both penalty types crossed with every C value.
param_grid = dict(penalty=['l1', 'l2'], C=C_range)

# Extra metrics wrapped as scorers (not passed to the search in this cell).
scorers = dict(
    precision_score=make_scorer(precision_score),
    recall_score=make_scorer(recall_score),
    accuracy_score=make_scorer(accuracy_score),
)

In [75]:
# Pin the solver: the grid includes penalty='l1', which only some solvers
# support. 'liblinear' handles both 'l1' and 'l2', so every grid point is
# valid regardless of the installed scikit-learn's default solver.
model = LogisticRegression(solver='liblinear')

# Search param_grid with the project helper, ranking candidates by ROC AUC.
best_model = grid_search_cross_val(model,X_train,Y_train,param_grid,scoring='roc_auc')

Best score: 0.908956230963
Best params: {'penalty': 'l1', 'C': 1.0}


In [76]:
# Fit the tuned model on the training split.
# NOTE(review): best_model looks like the GridSearchCV object from the helper, which
# already printed a best score; if it is already fitted (refit=True), this call
# repeats the entire search — confirm and drop it if redundant.
best_model.fit(X_train, Y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([  1.00000e-04,   1.00000e-03,   1.00000e-02,   1.00000e-01,
         1.00000e+00,   1.00000e+01,   1.00000e+02,   1.00000e+03,
         1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [77]:
# Hard class predictions for the hold-out split.
Y_predict = best_model.predict(X_test)

In [78]:
# Hold-out accuracy. Note that the search above ranked candidates by ROC AUC,
# so this number is not the metric the hyper-parameters were tuned for.
accuracy_score(Y_test, Y_predict)

0.85954149176622541

## Predictions for submission

In [79]:
# Re-fit the whole preprocessing + model chain on ALL labelled data for the
# final submission.
#
# Fresh transformers and stage-suffixed names are used on purpose: the raw
# x_train/x_test arrays and the hold-out normalizer/pca objects are left
# untouched, so re-running this cell always starts from the raw data instead
# of transforming already-transformed arrays.
submission_normalizer = Normalizer().fit(x_train)
x_train_norm = submission_normalizer.transform(x_train)
x_test_norm = submission_normalizer.transform(x_test)

# The projection is learned from the labelled rows only, then applied to both sets.
submission_pca = PCA().fit(x_train_norm)
x_train_pca = submission_pca.transform(x_train_norm)
x_test_pca = submission_pca.transform(x_test_norm)

# Fit on the full labelled set and predict class probabilities for the
# unlabelled rows; y_pred has one column per class.
best_model.fit(x_train_pca, y_train)
y_pred = best_model.predict_proba(x_test_pca)

In [80]:
# Inspect the predicted probabilities: one row per submission sample, one column per class.
y_pred

array([[ 0.6922944 ,  0.3077056 ],
       [ 0.3765742 ,  0.6234258 ],
       [ 0.05013616,  0.94986384],
       ..., 
       [ 0.18619785,  0.81380215],
       [ 0.95308146,  0.04691854],
       [ 0.51551951,  0.48448049]])

In [81]:
# Submit only column 1 of predict_proba, i.e. the probability of the second
# class in best_model.classes_.
# NOTE(review): "LogisticRegMod" is presumably the submission name/label — confirm in submission_generator.
generate_submission(y_pred[:,1].ravel(), "LogisticRegMod")