# libraries

In [1]:
# standard library for data manipulation
import pandas as pd
# automatic model optimizing
from tpot import TPOTClassifier
# metrics for evaluation
from sklearn.metrics import classification_report

# data

In [2]:
X_train = pd.read_csv("../data/2_processed/X_train_pretrained.csv")
y_train = pd.read_csv("../data/2_processed/y_train_pretrained.csv")
X_test = pd.read_csv("../data/2_processed/X_test_pretrained.csv")
y_test = pd.read_csv("../data/2_processed/y_test_pretrained.csv")

# model

## tpot [takes some time]

In [29]:
# substitute X_train_cv with X_train_tfidf or X_train_pretrained and analog X_test_cv to try different combinations out
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2)

pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))



  return f(*args, **kwargs)


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]



TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: ExtraTreesClassifier(MaxAbsScaler(input_matrix), bootstrap=False, criterion=gini, max_features=0.05, min_samples_leaf=4, min_samples_split=12, n_estimators=100)


  return f(*args, **kwargs)


0.7284348727398364


In [None]:
# export tpot model
pipeline_optimizer.export('tpot_exported_pipeline.py')

## test different standard models

In [3]:
models = {}

# Logistic Regression
from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression(max_iter=3000)

# Support Vector Machines
from sklearn.svm import LinearSVC
models['Support Vector Machines'] = LinearSVC()

# Decision Trees
from sklearn.tree import DecisionTreeClassifier
models['Decision Trees'] = DecisionTreeClassifier()

# Random Forest
from sklearn.ensemble import RandomForestClassifier
models['Random Forest'] = RandomForestClassifier()

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
models['Naive Bayes'] = GaussianNB()

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
models['K-Nearest Neighbor'] = KNeighborsClassifier()

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy, precision, recall = {}, {}, {}

for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train, y_train)
    
    # Prediction 
    predictions = models[key].predict(X_test)
    print("classificationreport ",key,":")
    print(classification_report(y_test, predictions))
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = accuracy_score(y_test, predictions)
    precision[key] = precision_score(y_test, predictions)
    recall[key] = recall_score(y_test, predictions)

  return f(*args, **kwargs)


classificationreport  Logistic Regression :
              precision    recall  f1-score   support

           0       0.78      0.80      0.79      4580
           1       0.77      0.75      0.76      4103

    accuracy                           0.78      8683
   macro avg       0.77      0.77      0.77      8683
weighted avg       0.78      0.78      0.77      8683



  return f(*args, **kwargs)


classificationreport  Support Vector Machines :
              precision    recall  f1-score   support

           0       0.78      0.80      0.79      4580
           1       0.77      0.75      0.76      4103

    accuracy                           0.78      8683
   macro avg       0.78      0.77      0.78      8683
weighted avg       0.78      0.78      0.78      8683

classificationreport  Decision Trees :
              precision    recall  f1-score   support

           0       0.64      0.62      0.63      4580
           1       0.59      0.60      0.60      4103

    accuracy                           0.61      8683
   macro avg       0.61      0.61      0.61      8683
weighted avg       0.61      0.61      0.61      8683



  models[key].fit(X_train, y_train)


classificationreport  Random Forest :
              precision    recall  f1-score   support

           0       0.72      0.80      0.76      4580
           1       0.74      0.66      0.70      4103

    accuracy                           0.73      8683
   macro avg       0.73      0.73      0.73      8683
weighted avg       0.73      0.73      0.73      8683



  return f(*args, **kwargs)


classificationreport  Naive Bayes :
              precision    recall  f1-score   support

           0       0.67      0.71      0.69      4580
           1       0.65      0.61      0.63      4103

    accuracy                           0.66      8683
   macro avg       0.66      0.66      0.66      8683
weighted avg       0.66      0.66      0.66      8683



  return self._fit(X, y)


classificationreport  K-Nearest Neighbor :
              precision    recall  f1-score   support

           0       0.70      0.77      0.74      4580
           1       0.71      0.64      0.68      4103

    accuracy                           0.71      8683
   macro avg       0.71      0.71      0.71      8683
weighted avg       0.71      0.71      0.71      8683



In [5]:
import pandas as pd

df_model = pd.DataFrame(index=models.keys(), columns=['Accuracy', 'Precision', 'Recall'])
df_model['Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Accuracy,Precision,Recall
Logistic Regression,0.775193,0.770294,0.747014
Support Vector Machines,0.776229,0.771084,0.74872
Decision Trees,0.614189,0.58975,0.602973
Random Forest,0.730623,0.742308,0.658543
Naive Bayes,0.660025,0.6503,0.606873
K-Nearest Neighbor,0.709202,0.714986,0.639532
