In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from time import time
import pandas as pd

In [18]:
def train_model(data, data_test, model, target='is_fake', test_size=0.15, random_state=13):
    """Fit ``model`` on a split of ``data`` and print timings, accuracies and reports.

    ``data`` is split into a training and a validation part with
    ``train_test_split``; ``data_test`` is used unchanged as a separate test
    set. The model is fitted on the training part, then used to predict the
    validation and the test part. Everything is reported through ``print``.

    Parameters
    ----------
    data : object supporting ``.drop(target, axis=1)`` and ``[target]``
        Labelled data used for training and validation (a pandas DataFrame
        in this notebook). Must contain the ``target`` column.
    data_test : same kind of object as ``data``
        Labelled data used only for the final test evaluation. Must contain
        the ``target`` column.
    model : estimator exposing ``fit``, ``predict`` and ``score``
        The classifier to train. It is fitted in place.
    target : str, default 'is_fake'
        Name of the label column in both ``data`` and ``data_test``.
    test_size : float, default 0.15
        Forwarded to ``train_test_split`` as the validation share.
    random_state : int, default 13
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    None
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        data.drop(target, axis=1), data[target],
        test_size=test_size, random_state=random_state)

    X_test = data_test.drop(target, axis=1)
    y_test = data_test[target]

    # Wall-clock timestamps around each phase: fit, predict(valid), predict(test).
    t0 = time()
    model.fit(X_train, y_train)
    t1 = time()
    y_pred_valid = model.predict(X_valid)
    t2 = time()
    y_pred_test = model.predict(X_test)
    t3 = time()

    time_train = t1 - t0
    time_pred_valid = t2 - t1
    time_pred_test = t3 - t2

    print(model)
    print(f'Training time: {time_train}')
    print(f'Prediction time (validation): {time_pred_valid}')
    print(f'Prediction time (test): {time_pred_test} \n')
    print('Accuracy score train set :', model.score(X_train, y_train))
    print('Accuracy score valid set  :', accuracy_score(y_valid, y_pred_valid))
    print('Accuracy score test set  :', accuracy_score(y_test, y_pred_test), '\n')

    # zero_division=0: precision/recall are undefined for a class that has no
    # true (or no predicted) samples. Report them as 0, the same value the
    # default setting falls back to, without emitting a warning per metric.
    print('Validation classification report: ')
    print(classification_report(y_valid, y_pred_valid, zero_division=0))
    print('Test classification report: ')
    print(classification_report(y_test, y_pred_test, zero_division=0))

    print('\n -------------------------------------------------------------------------------------- \n')

In [19]:
# Folder holding the prepared CSV files, relative to the working directory.
# A forward slash is used as the separator: it is accepted on Windows as well
# as on POSIX systems, whereas a hard-coded backslash only resolves on Windows.
path = 'new data/'
name_train = 'new_df_train.csv'
name_test = 'new_df_test.csv'

# df is later split into train/validation parts by train_model;
# df_test is passed to it as the separate test set.
df = pd.read_csv(path + name_train)
df_test = pd.read_csv(path + name_test)

In [20]:
# Each candidate classifier is evaluated in its own cell rather than in one
# loop; this cell runs LogisticRegression constructed with no arguments.
train_model(data=df, data_test=df_test, model=LogisticRegression())

LogisticRegression()
Training time: 3.893789052963257
Prediction time (validation): 0.07081007957458496
Prediction time (test): 0.08377623558044434 

Accuracy score train set : 0.9836534532080098
Accuracy score valid set  : 0.8368055555555556
Accuracy score test set  : 0.538 

Validation classification report: 
              precision    recall  f1-score   support

           0       0.80      0.91      0.85       435
           1       0.89      0.77      0.82       429

    accuracy                           0.84       864
   macro avg       0.84      0.84      0.84       864
weighted avg       0.84      0.84      0.84       864

Test classification report: 
              precision    recall  f1-score   support

           0       1.00      0.54      0.70      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.54      1000
   macro avg       0.50      0.27      0.35      1000
weighted avg       1.00      0.54      0.70      1000


 --

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Evaluate a KNeighborsClassifier constructed with no arguments.
train_model(data=df, data_test=df_test, model=KNeighborsClassifier())

KNeighborsClassifier()
Training time: 0.19546937942504883
Prediction time (validation): 1.1229655742645264
Prediction time (test): 1.72438645362854 

Accuracy score train set : 0.8475684511646915
Accuracy score valid set  : 0.6481481481481481
Accuracy score test set  : 0.569 

Validation classification report: 
              precision    recall  f1-score   support

           0       0.64      0.71      0.67       435
           1       0.66      0.59      0.62       429

    accuracy                           0.65       864
   macro avg       0.65      0.65      0.65       864
weighted avg       0.65      0.65      0.65       864

Test classification report: 
              precision    recall  f1-score   support

           0       1.00      0.57      0.73      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.57      1000
   macro avg       0.50      0.28      0.36      1000
weighted avg       1.00      0.57      0.73      1000


 --

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
# Evaluate a RandomForestClassifier constructed with no arguments.
train_model(data=df, data_test=df_test, model=RandomForestClassifier())

RandomForestClassifier()
Training time: 32.12558197975159
Prediction time (validation): 0.23134827613830566
Prediction time (test): 0.22451448440551758 

Accuracy score train set : 0.9997956681651001
Accuracy score valid set  : 0.7731481481481481
Accuracy score test set  : 0.615 

Validation classification report: 
              precision    recall  f1-score   support

           0       0.71      0.91      0.80       435
           1       0.88      0.63      0.73       429

    accuracy                           0.77       864
   macro avg       0.80      0.77      0.77       864
weighted avg       0.80      0.77      0.77       864

Test classification report: 
              precision    recall  f1-score   support

           0       1.00      0.61      0.76      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.61      1000
   macro avg       0.50      0.31      0.38      1000
weighted avg       1.00      0.61      0.76      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# Evaluate an XGBClassifier constructed with no arguments.
train_model(data=df, data_test=df_test, model=XGBClassifier())



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
Training time: 99.88963508605957
Prediction time (validation): 0.7898859977722168
Prediction time (test): 0.8227989673614502 

Accuracy score train set : 0.8220269718022067
Accuracy score valid set  : 0.7511574074074074
Accuracy score test set  : 0.601 

Validation classification report: 
              precision    recall  f1-score   support

           0     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
