## Importing libraries

In [20]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

---

## Importing data

In [21]:
# File locations of the preprocessed training labels, training features and test features.
y_train_path, x_train_path, x_test_path = (
    os.path.join('data', csv_name)
    for csv_name in ('preprocessed_y_train.csv', 'preprocessed_x_train.csv', 'preprocessed_x_test.csv')
)

# The first CSV column is used as the row index of every frame (index_col = [0]).
y_train, x_train, x_test = (
    pd.read_csv(csv_path, index_col = [0])
    for csv_path in (y_train_path, x_train_path, x_test_path)
)

In [22]:
# Show the first rows of the training labels (head() defaults to five rows).
y_train.head()

Unnamed: 0,Choice
0,0
1,0
2,0
3,0
4,1


In [23]:
# Show the first rows of the training features (head() defaults to five rows).
x_train.head()

Unnamed: 0,follower_count,following_count,listed_count,mentions_received,retweets_sent,network_feature_1,combined1,combined2
0,-0.986855,-0.97994,-0.996454,-0.927069,-0.535702,-0.941176,1.086996,-1.877265
1,-0.290388,0.163296,-0.751904,0.381527,0.32636,0.387218,-1.130558,0.935012
2,0.337358,0.431939,-0.34202,0.943998,0.709406,0.938776,1.650897,1.510019
3,-0.997957,-0.999207,-0.985714,-0.973506,-0.972344,-0.993213,-1.714039,-1.745739
4,0.71571,-0.329965,0.871722,0.74654,0.497116,0.732704,0.622613,1.288425


In [24]:
# Show the first rows of the test features (head() defaults to five rows).
x_test.head()

Unnamed: 0,follower_count,following_count,listed_count,mentions_received,retweets_sent,network_feature_1,combined1,combined2
0,-0.391099,-0.169263,-0.55565,-0.657994,-0.382888,-0.715077,-0.300755,0.145751
1,-0.011451,-0.094035,-0.012355,0.45454,-0.200833,0.5,-1.81265,-1.340918
2,-0.985223,0.825184,-0.954763,-0.995382,0.004156,-0.994401,-0.041999,0.210956
3,-0.912858,-0.48168,-0.94379,-0.771889,0.917062,-0.829787,-1.720317,1.232773
4,-0.045534,-0.995523,0.019681,0.317849,0.705035,0.295968,-0.828373,-1.514095


---

## Importing models and evaluation utilities

In [25]:
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

---

In [26]:
class ModelEvaluator():
    '''
    Pairs a classifier class with a set of hyper-parameters so the same configuration can be
    cross-validated (kfold_evaluate) and used to write a submission file (export_predictions).

    Model: the classifier class itself (not an instance). A new instance is created with Model()
        every time a model is needed; it must provide set_params, fit and predict_proba.
    params: optional dict of keyword arguments passed to set_params on every new instance.
    '''
    def __init__(self, Model, params = None):
        self.Model = Model
        # A literal {} default would be one dict shared by every evaluator created without params.
        self.params = {} if params is None else params

    def _build_model(self):
        '''Returns a new, unfitted instance of self.Model configured with self.params.'''
        model = self.Model()
        model.set_params(**self.params)
        return model

    def kfold_evaluate(self, x_train, y_train, n_splits, random_state = None):
        '''
        Cross-validates the model based on K-Fold of n_splits splits from x_train (features) and y_train (labels).
        Prints the AUC (Area Under the ROC Curve) mean score for the K-Fold.

        x_train: DataFrame of features.
        y_train: labels, matched to the rows of x_train by index label.
        n_splits: number of folds.
        random_state: optional seed for the shuffling of the rows before they are split into folds.
            With the default (None) every call shuffles differently; pass an int to get reproducible folds.
        '''
        # Line the labels up with the feature rows once, so the positional fold indexes apply to both.
        y_aligned = y_train.loc[x_train.index]
        # KFold does the shuffling itself, which makes it controllable through random_state.
        kfold = KFold(n_splits, shuffle = True, random_state = random_state)
        kfold_auc = []

        for (train_indexes, eval_indexes) in kfold.split(x_train):
            model_instance = self._build_model()
            model_instance.fit(x_train.iloc[train_indexes], y_aligned.iloc[train_indexes].values.ravel())

            # Column 1 of predict_proba is the probability of the second class (label 1 for 0/1 labels).
            kfold_y_pred = model_instance.predict_proba(x_train.iloc[eval_indexes])[:, 1]
            kfold_y_eval = y_aligned.iloc[eval_indexes].values.ravel()
            fpr, tpr, _ = roc_curve(kfold_y_eval, kfold_y_pred, pos_label=1)
            kfold_auc.append(auc(fpr, tpr))

        print(f'{model_instance} KFold auc: {sum(kfold_auc)/len(kfold_auc)}')

    def export_predictions(self, file_name, train_features = None, train_labels = None, test_features = None):
        '''
        Exports the predictions from the model in a .csv file.
        Uses the submission standard for the Kaggle challenge 'Influencers in Social Networks'.

        file_name: name of the output file, without extension; it is written to data/predictions/<file_name>.csv.
        train_features, train_labels, test_features: optional data to fit on and to predict for.
            Any of them left as None falls back to the notebook-level x_train, y_train and x_test respectively.
        '''
        # Falling back to the notebook-level frames keeps the one-argument call working.
        train_features = x_train if train_features is None else train_features
        train_labels = y_train if train_labels is None else train_labels
        test_features = x_test if test_features is None else test_features

        model = self._build_model()
        model.fit(train_features, train_labels.values.ravel())

        # Column 1 of predict_proba is the probability of the second class (label 1 for 0/1 labels).
        positive_proba = model.predict_proba(test_features)[:, 1]
        # The exported 'Id' column is the test index shifted by one.
        y_pred = pd.DataFrame({'Choice': positive_proba}, index = (test_features.index + 1).rename('Id'))

        predictions_dir = os.path.join('data', 'predictions')
        # Create the output folder on first use instead of failing when it does not exist yet.
        os.makedirs(predictions_dir, exist_ok = True)
        y_pred_path = os.path.join(predictions_dir, f'{file_name}.csv')
        y_pred.to_csv(y_pred_path, index = True)

## 1) Logistic Regression

In [27]:
# Logistic regression with no penalty term (penalty = None).
logreg_params = dict(random_state = 2, max_iter = 100, penalty = None)

logreg_evaluator = ModelEvaluator(Model = LogisticRegression, params = logreg_params)

In [28]:
# 10-fold cross-validated AUC for the logistic regression configuration.
logreg_evaluator.kfold_evaluate(x_train = x_train, y_train = y_train, n_splits = 10)

LogisticRegression(penalty=None, random_state=2) KFold auc: 0.8561744386871778


In [29]:
# Writes data/predictions/logreg_predictions.csv.
logreg_evaluator.export_predictions(file_name = 'logreg_predictions')

## 2) Decision Tree

In [30]:
# Decision tree limited to depth 4 with at least 11 samples per leaf.
dt_params = dict(random_state = 2, max_depth = 4, min_samples_leaf = 11)

dt_evaluator = ModelEvaluator(Model = DecisionTreeClassifier, params = dt_params)

In [31]:
# 10-fold cross-validated AUC for the decision tree configuration.
dt_evaluator.kfold_evaluate(x_train = x_train, y_train = y_train, n_splits = 10)

DecisionTreeClassifier(max_depth=4, min_samples_leaf=11, random_state=2) KFold auc: 0.8428899621125933


In [32]:
# Writes data/predictions/dt_predictions.csv.
dt_evaluator.export_predictions(file_name = 'dt_predictions')

## 3) Random Forest

In [33]:
# Random forest of 1000 trees, each limited to depth 6 with at least 2 samples per leaf.
rf_params = dict(
    random_state = 2,
    n_estimators = 1000,
    max_depth = 6,
    max_features = 'sqrt',
    min_samples_leaf = 2,
)

rf_evaluator = ModelEvaluator(Model = RandomForestClassifier, params = rf_params)

In [34]:
# 5-fold cross-validated AUC for the random forest configuration.
rf_evaluator.kfold_evaluate(x_train = x_train, y_train = y_train, n_splits = 5)

RandomForestClassifier(max_depth=6, min_samples_leaf=2, n_estimators=1000,
                       random_state=2) KFold auc: 0.8611853705867658


In [35]:
# Writes data/predictions/rf_predictions.csv.
rf_evaluator.export_predictions(file_name = 'rf_predictions')

## 4) Support Vector Machine

In [36]:
# probability = True is required so that SVC provides predict_proba, which ModelEvaluator calls.
# random_state seeds the internal shuffling SVC performs for its probability estimates, so repeated
# runs give the same predictions; 2 matches the seed used by the other models in this notebook.
# degree only affects the polynomial kernel and is ignored by the kernel used here (the kernel is left at its default).
svm_params = {'probability': True,
              'degree': 3,
              'random_state': 2}

svm_evaluator = ModelEvaluator(SVC, svm_params)

In [37]:
# 10-fold cross-validated AUC for the support vector machine configuration.
svm_evaluator.kfold_evaluate(x_train = x_train, y_train = y_train, n_splits = 10)

SVC(probability=True) KFold auc: 0.8305574777262933


In [38]:
# Writes data/predictions/svm_predictions.csv.
svm_evaluator.export_predictions(file_name = 'svm_predictions')

## 5) Ridge Classification with Logistic Regression

In [39]:
# Logistic regression with an L2 (ridge) penalty; C = 0.5.
ridge_params = dict(random_state = 2, max_iter = 100, penalty = 'l2', C = 0.5)

ridge_evaluator = ModelEvaluator(Model = LogisticRegression, params = ridge_params)

In [40]:
# 5-fold cross-validated AUC for the L2-penalised logistic regression configuration.
ridge_evaluator.kfold_evaluate(x_train = x_train, y_train = y_train, n_splits = 5)

LogisticRegression(C=0.5, random_state=2) KFold auc: 0.8560698142589098


In [41]:
# Writes data/predictions/ridge_predictions.csv.
ridge_evaluator.export_predictions(file_name = 'ridge_predictions')

## 6) Lasso Classification with Logistic Regression

In [42]:
# Logistic regression with an L1 (lasso) penalty, fitted with the liblinear solver; C = 0.1.
lasso_params = dict(
    random_state = 2,
    max_iter = 100,
    solver = 'liblinear',
    penalty = 'l1',
    C = 0.1,
)

lasso_evaluator = ModelEvaluator(Model = LogisticRegression, params = lasso_params)

In [43]:
# 5-fold cross-validated AUC for the L1-penalised logistic regression configuration.
lasso_evaluator.kfold_evaluate(x_train = x_train, y_train = y_train, n_splits = 5)

LogisticRegression(C=0.1, penalty='l1', random_state=2, solver='liblinear') KFold auc: 0.8565326996742855


In [44]:
# Writes data/predictions/lasso_predictions.csv.
lasso_evaluator.export_predictions(file_name = 'lasso_predictions')

## 7) Gaussian Naive Bayes

In [45]:
# Gaussian Naive Bayes is evaluated with no parameter overrides.
gnb_params = dict()

gnb_evaluator = ModelEvaluator(Model = GaussianNB, params = gnb_params)

In [46]:
# 10-fold cross-validated AUC for the Gaussian Naive Bayes configuration.
gnb_evaluator.kfold_evaluate(x_train = x_train, y_train = y_train, n_splits = 10)

GaussianNB() KFold auc: 0.8530458373695213


In [47]:
# Writes data/predictions/gnb_predictions.csv.
gnb_evaluator.export_predictions(file_name = 'gnb_predictions')