## Importing libraries

In [8]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

---

## Importing data

In [9]:
y_train_path = os.path.join('data', 'preprocessed_y_train.csv')
x_train_path = os.path.join('data', 'preprocessed_x_train.csv')
x_test_path = os.path.join('data', 'preprocessed_x_test.csv')

y_train = pd.read_csv(y_train_path, index_col = [0])
x_train = pd.read_csv(x_train_path, index_col = [0])
x_test = pd.read_csv(x_test_path, index_col = [0])

In [10]:
y_train.head()

Unnamed: 0,Choice
0,0
1,0
2,0
3,0
4,1


In [11]:
x_train.head()

Unnamed: 0,follower_count,following_count,listed_count,mentions_received,retweets_received,mentions_sent,retweets_sent,posts,network_feature_1,network_feature_2,network_feature_3
0,-0.986855,-0.97994,-0.996454,-0.927069,-0.950788,-0.975796,-0.535702,-0.901469,-0.941176,0.375861,0.711135
1,-0.290388,0.163296,-0.751904,0.381527,0.48514,0.277683,0.32636,0.65733,0.387218,-0.754868,-0.375691
2,0.337358,0.431939,-0.34202,0.943998,0.928486,0.797743,0.709406,0.712276,0.938776,0.739259,0.911638
3,-0.997957,-0.999207,-0.985714,-0.973506,-0.99858,-0.907802,-0.972344,-0.837938,-0.993213,-0.842948,-0.871091
4,0.71571,-0.329965,0.871722,0.74654,0.823479,0.609503,0.497116,0.678922,0.732704,0.448564,0.17405


In [12]:
x_test.head()

Unnamed: 0,follower_count,following_count,listed_count,mentions_received,retweets_received,mentions_sent,retweets_sent,posts,network_feature_1,network_feature_2,network_feature_3
0,-0.391099,-0.169263,-0.55565,-0.657994,-0.790458,0.353497,-0.382888,-0.207745,-0.715077,-0.190526,-0.110229
1,-0.011451,-0.094035,-0.012355,0.45454,0.800918,-0.786232,-0.200833,-0.554686,0.5,-0.898676,-0.913974
2,-0.985223,0.825184,-0.954763,-0.995382,-0.995333,0.509898,0.004156,-0.298942,-0.994401,0.090114,-0.132113
3,-0.912858,-0.48168,-0.94379,-0.771889,-0.829438,0.587468,0.917062,0.645305,-0.829787,-0.915854,-0.804463
4,-0.045534,-0.995523,0.019681,0.317849,0.487645,-0.669636,0.705035,-0.84446,0.295968,-0.56,-0.268373


---

## Importing Models

In [115]:
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

---

In [14]:
class ModelEvaluator():
    def __init__(self, Model, params = {}):
        self.Model = Model
        self.params = params
    
    def kfold_evaluate(self, x_train, y_train, n_splits):
        '''
        Cross-validates the model based on K-Fold of n_splits splits from x_train (features) and y_train (labels). 
        Prints the AUC (Area Under the ROC Curve) mean score for the K-Fold.
        '''
        indexes = np.array(x_train.index)
        np.random.shuffle(indexes)
        x_shuffled = x_train.loc[indexes].copy()
        y_shuffled = y_train.loc[indexes].copy()
        kfold = KFold(n_splits)
        kfold_auc = []

        for (train_indexes, eval_indexes) in kfold.split(x_train):
            kfold_x_train = x_shuffled.iloc[train_indexes].copy()
            kfold_y_train = y_shuffled.iloc[train_indexes].copy()
            kfold_x_eval = x_shuffled.iloc[eval_indexes].copy()
            kfold_y_eval = y_shuffled.iloc[eval_indexes].copy()

            model_instance = self.Model()
            model_instance.set_params(**self.params)
            model_instance.fit(kfold_x_train, kfold_y_train.values.ravel())
            kfold_y_pred = [prob[1] for prob in model_instance.predict_proba(kfold_x_eval)]
            fpr, tpr, thresholds = roc_curve(kfold_y_eval, kfold_y_pred, pos_label=1)
            kfold_auc.append(auc(fpr, tpr))

        print(f'{model_instance} KFold auc: {sum(kfold_auc)/len(kfold_auc)}')
    
    def export_predictions(self, file_name):
        '''
        Exports the predictions from the model in a .csv file.
        Uses the submission standard for the Kaggle's challenge 'Influencers in Social Networks'.
        '''
        model = self.Model()
        model.set_params(**self.params)
        model.fit(x_train, y_train.values.ravel())

        y_pred = [prob[1] for prob in model.predict_proba(x_test)]
        y_pred = pd.DataFrame(y_pred, index = x_test.index.rename('Id'), columns = ['Choice'])
        y_pred.index = y_pred.index + 1

        y_pred_path = os.path.join('data', f'{file_name}.csv')
        y_pred.to_csv(y_pred_path, index = True)

## 1) Logistic Regression

In [39]:
logreg_params = {'random_state': 2,
                 'max_iter': 100,
                 'penalty': None}

logreg_evaluator = ModelEvaluator(LogisticRegression, logreg_params)

In [40]:
logreg_evaluator.kfold_evaluate(x_train, y_train, 10)

LogisticRegression(penalty=None, random_state=2) KFold auc: 0.8560347069117036


In [41]:
logreg_evaluator.export_predictions('logreg_predictions')

## 2) Decision Tree

In [159]:
dt_params = {'random_state': 2,
             'max_depth': 4,
             'min_samples_leaf': 11}

dt_evaluator = ModelEvaluator(DecisionTreeClassifier, dt_params)

In [160]:
dt_evaluator.kfold_evaluate(x_train, y_train, 10)

DecisionTreeClassifier(max_depth=4, min_samples_leaf=11, random_state=2) KFold auc: 0.8467281214818708


In [161]:
dt_evaluator.export_predictions('dt_predictions')

## 3) Random Forest

In [18]:
rf_params = {'random_state': 2,
             'n_estimators': 1000,
             'max_depth': 6,
             'max_features': 'sqrt',
             'min_samples_leaf': 2}

rf_evaluator = ModelEvaluator(RandomForestClassifier, rf_params)

In [19]:
rf_evaluator.kfold_evaluate(x_train, y_train, 5)

RandomForestClassifier(max_depth=6, min_samples_leaf=2, n_estimators=1000,
                       random_state=2) KFold auc: 0.859991163835061


In [20]:
rf_evaluator.export_predictions('rf_predictions')

## 4) Support Vector Machine

In [34]:
svm_params = {'probability': True,
              'degree': 3}

svm_evaluator = ModelEvaluator(SVC, svm_params)

In [35]:
svm_evaluator.kfold_evaluate(x_train, y_train, 10)

SVC(probability=True) KFold auc: 0.8308751885974089


In [36]:
svm_evaluator.export_predictions('svm_predictions')

## 5) Ridge Classification with Logistic Regression

In [96]:
ridge_params = {'random_state': 2,
                 'max_iter': 100,
                 'penalty': 'l2',
                 'C': 0.5}

ridge_evaluator = ModelEvaluator(LogisticRegression, ridge_params)

In [97]:
ridge_evaluator.kfold_evaluate(x_train, y_train, 5)

LogisticRegression(C=0.5, random_state=2) KFold auc: 0.8563920997843187


In [93]:
ridge_evaluator.export_predictions('ridge_predictions')

## 6) Lasso Classification with Logistic Regression

In [111]:
lasso_params = {'random_state': 2,
                 'max_iter': 100,
                 'solver': 'liblinear',
                 'penalty': 'l1',
                 'C': 0.1}

lasso_evaluator = ModelEvaluator(LogisticRegression, lasso_params)

In [112]:
lasso_evaluator.kfold_evaluate(x_train, y_train, 5)

LogisticRegression(C=0.1, penalty='l1', random_state=2, solver='liblinear') KFold auc: 0.8564418249890082


In [113]:
lasso_evaluator.export_predictions('lasso_predictions')

## 7) Gaussian Naive Bayes

In [164]:
gnb_params = {}

gnb_evaluator = ModelEvaluator(GaussianNB, gnb_params)

In [167]:
gnb_evaluator.kfold_evaluate(x_train, y_train, 10)

GaussianNB() KFold auc: 0.849752698441112


In [168]:
gnb_evaluator.export_predictions('gnb_predictions')