In [1]:
from time import perf_counter, time

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [2]:
def train_model(data, model, test_data=None, target='is_fake',
                test_size=0.15, random_state=13):
    """Fit ``model`` on a train/validation split of ``data`` and print a report.

    The frame is split with ``train_test_split``; the estimator is fitted on
    the training part and evaluated on the validation part. Fit and predict
    durations, train/validation accuracy and a validation classification
    report are printed to stdout.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the label column named by ``target``.
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``. It is fitted in
        place, so the caller's instance is trained after the call.
    test_data : pandas.DataFrame, optional
        Separate labelled frame with the same columns as ``data``. When given,
        a classification report for it is printed as well; when omitted the
        test section is skipped entirely.
    target : str, default 'is_fake'
        Name of the label column.
    test_size : float, default 0.15
        Share of ``data`` held out for validation.
    random_state : int, default 13
        Seed passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    None
        Results are only printed.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        data.drop(columns=target), data[target],
        test_size=test_size, random_state=random_state)

    # perf_counter() is monotonic and high-resolution; wall-clock time() can
    # jump if the system clock is adjusted while the model is fitting.
    t0 = perf_counter()
    model.fit(X_train, y_train)
    t1 = perf_counter()
    y_pred_valid = model.predict(X_valid)
    t2 = perf_counter()

    time_train = t1 - t0
    time_pred_valid = t2 - t1

    print(model)
    print(f'Training time: {time_train}')
    print(f'Prediction time (validation): {time_pred_valid}')
    print('Accuracy score train set :', model.score(X_train, y_train))
    print('Accuracy score valid set  :', accuracy_score(y_valid, y_pred_valid))
    print('Validation classification report: ')
    print(classification_report(y_valid, y_pred_valid))

    # Only announce a test report when there is one to show; previously the
    # header was printed with nothing underneath it.
    if test_data is not None:
        y_pred_test = model.predict(test_data.drop(columns=target))
        print('Test classification report: ')
        print(classification_report(test_data[target], y_pred_test))

    print('\n -------------------------------------------------------------------------------------- \n')

In [3]:
# Directory holding the prepared CSV files. A forward slash is accepted on
# Windows as well as POSIX, so the notebook runs on either; the trailing
# separator is kept so `path + <file name>` concatenation keeps working.
path = 'new data/'
name_train = 'new_df_train.csv'

# Labelled training frame passed to train_model() in the cells below.
df = pd.read_csv(path + name_train)

In [4]:
# Evaluate logistic regression with scikit-learn's default hyper-parameters.
train_model(df, LogisticRegression())

LogisticRegression()
Training time: 3.436037063598633
Prediction time (validation): 0.05784320831298828
Accuracy score train set : 0.9836534532080098
Accuracy score valid set  : 0.8368055555555556
Validation classification report: 
              precision    recall  f1-score   support

           0       0.80      0.91      0.85       435
           1       0.89      0.77      0.82       429

    accuracy                           0.84       864
   macro avg       0.84      0.84      0.84       864
weighted avg       0.84      0.84      0.84       864

Test classification report: 

 -------------------------------------------------------------------------------------- 



In [5]:
# Evaluate k-nearest neighbours with scikit-learn's default hyper-parameters.
train_model(data=df, model=KNeighborsClassifier())

KNeighborsClassifier()
Training time: 0.21146583557128906
Prediction time (validation): 1.0922200679779053
Accuracy score train set : 0.8475684511646915
Accuracy score valid set  : 0.6481481481481481
Validation classification report: 
              precision    recall  f1-score   support

           0       0.64      0.71      0.67       435
           1       0.66      0.59      0.62       429

    accuracy                           0.65       864
   macro avg       0.65      0.65      0.65       864
weighted avg       0.65      0.65      0.65       864

Test classification report: 

 -------------------------------------------------------------------------------------- 



In [6]:
# Random forests draw bootstrap samples and feature subsets at random; seed the
# estimator so the reported metrics are the same on every Restart & Run All.
train_model(df, RandomForestClassifier(random_state=13))

RandomForestClassifier()
Training time: 30.289986610412598
Prediction time (validation): 0.1655571460723877
Accuracy score train set : 1.0
Accuracy score valid set  : 0.7743055555555556
Validation classification report: 
              precision    recall  f1-score   support

           0       0.72      0.91      0.80       435
           1       0.88      0.63      0.74       429

    accuracy                           0.77       864
   macro avg       0.80      0.77      0.77       864
weighted avg       0.80      0.77      0.77       864

Test classification report: 

 -------------------------------------------------------------------------------------- 



In [7]:
# Evaluate gradient-boosted trees (XGBoost) with the library's default settings.
train_model(data=df, model=XGBClassifier())



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
Training time: 60.81608533859253
Prediction time (validation): 0.23636651039123535
Accuracy score train set : 0.8220269718022067
Accuracy score valid set  : 0.7511574074074074
Validation classification report: 
              precision    recall  f1-score   support

           0       0.70      0.89      0.78       435
           1       0.84      0.61      0.7