In [2]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sys
import joblib
import glob



In [5]:



def data_preprocess(data):
  '''Recode the Polarity column to integers and return the class names.

  The string label 'Fake' becomes 0 and 'TRUE' becomes 1. Every other value
  (labels that are already numeric, missing values, unknown strings) is kept
  unchanged.

  Parameters
  ----------
  data : pandas DataFrame with a "Polarity" column.

  Returns
  -------
  (data, outcomes) : a recoded copy of the frame and the display names
  ["Fake", "Real"] for classes 0 and 1. The frame passed in is not modified,
  so the function can be re-run on the same input.
  '''

  label_codes = {"Fake": 0, "TRUE": 1}

  # Work on a copy: the previous chained assignment
  # (data.Polarity[mask] = ...) mutated the caller's frame, triggers
  # SettingWithCopyWarning, and is a silent no-op under pandas copy-on-write.
  recoded = data.copy()

  # The recode is unconditional. The former guard
  # `data.Polarity[0] != 0 or 1` was always true, and it raised KeyError
  # whenever the index had no label 0.
  # Series.map with a function re-infers the dtype, so a fully recoded column
  # comes back as integers rather than as an object column holding ints.
  recoded["Polarity"] = recoded["Polarity"].map(
    lambda label: label_codes.get(label, label) if isinstance(label, str) else label
  )

  outcomes = ["Fake","Real"]

  return recoded, outcomes


def split_the_test(data, features):

  '''Separate the feature matrix from the target column.

  Rows containing a missing value in any column of `data` (not only in the
  requested features) are dropped first. No train/test split is performed.

  Parameters
  ----------
  data : pandas DataFrame that includes the feature columns and "Polarity".
  features : list of column names to use as model inputs.

  Returns
  -------
  (X, y) : the selected feature columns and the Polarity column, both taken
  from the rows that survived the dropna.
  '''

  complete_rows = data.dropna()

  return complete_rows[features], complete_rows.Polarity


def class_report(y_test, y_pred):

  '''Print the classification report for a set of predictions and return it.

  The two classes are reported under the names "Fake" and "Real".

  Parameters
  ----------
  y_test : true labels.
  y_pred : predicted labels.

  Returns
  -------
  The value returned by classification_report (also printed).
  '''

  report_text = classification_report(y_test, y_pred, target_names=["Fake","Real"])
  print(report_text)

  return report_text


def conf_matrix(model_name, y_pred, y, cmap="magma"):

  '''Compute the confusion matrix for a set of predictions and draw it as a heatmap.

  Parameters
  ----------
  model_name : text used in the figure title.
  y_pred : predicted labels.
  y : true labels.
  cmap : colormap forwarded to seaborn.heatmap.

  Returns
  -------
  The matrix returned by confusion_matrix, with rows indexed by the actual
  label and columns by the predicted label.
  '''

  # Tick labels follow the sorted label order of the matrix.
  # NOTE(review): assumes class 0 is "Fake" and class 1 is "Real" - confirm
  # against how the labels were encoded upstream.
  outcomes = ["Fake","Real"]

  # scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
  # true labels and columns the predictions. Passing (y_pred, y) transposed the
  # matrix relative to the "Actual"/"Predicted" axis labels set below.
  matrix = confusion_matrix(y, y_pred)

  fig, ax = plt.subplots()
  # heatmap sets the tick positions and labels itself, so the earlier
  # plt.xticks/plt.yticks calls (and their np.arange, with np never imported)
  # are not needed.
  sns.heatmap(pd.DataFrame(matrix), annot=True, cmap=cmap, fmt="g",
              xticklabels=outcomes, yticklabels=outcomes, ax=ax)
  ax.xaxis.set_label_position("top")

  # The old format string had two placeholders but a single argument and
  # raised IndexError.
  ax.set_title("{} Confusion Matrix".format(model_name), y = 1.1)
  ax.set_ylabel("Actual label")
  ax.set_xlabel("Predicted label")

  # Lay out last so the title and axis labels are taken into account.
  fig.tight_layout()

  return matrix

def eval_models(model, X, y):

  '''Evaluate one model on X / y and collect its metrics.

  The model is not fitted here; only model.predict is called. The function
  prints the overall accuracy and the classification report, and draws the
  confusion matrix.

  Parameters
  ----------
  model : object exposing predict(X); str(model) is used as its display name.
  X : feature matrix passed to model.predict.
  y : true labels the predictions are scored against.

  Returns
  -------
  A list holding a single dict with the keys "model" (display name),
  "accuracy", "report" (classification report) and "confussion_matrix".
  '''

  model_name = str(model)

  print("\n",model_name, "\n", "\n")

  # predict() takes the feature matrix only; passing y as a second positional
  # argument raised TypeError.
  y_pred = model.predict(X)

  accuracy = accuracy_score(y, y_pred)

  print("OVERALL ACCURACY", model_name, ":", round(accuracy*100, 2),"%""\n")

  scores = class_report(y, y_pred)

  print("\n")

  # conf_matrix's signature is (model_name, y_pred, y, cmap). The previous call
  # also passed `features`, which is not a parameter of this function and
  # shifted every argument, ending in "multiple values for cmap".
  confussion_matrix = conf_matrix(model_name, y_pred, y, cmap="magma")

  model_dict = {
    "model": model_name,
    "accuracy": accuracy,
    "report": scores,
    "confussion_matrix": confussion_matrix,
  }

  return [model_dict]

def magic(data, model, features):

  '''Run the whole evaluation pipeline for one model.

  Chains data_preprocess (label recoding), split_the_test (feature / target
  separation) and eval_models (prediction and metrics).

  Parameters
  ----------
  data : pandas DataFrame with the feature columns and a "Polarity" column.
  model : model object handed to eval_models.
  features : list of the column names the model takes as input.

  Returns
  -------
  The list returned by eval_models: one dict with the model name, the accuracy
  score, the classification report and the confusion matrix.
  '''

  # Recode the labels; the class names returned alongside are not needed here.
  prepared, _ = data_preprocess(data)

  # Keep only the requested feature columns plus the target.
  X, y = split_the_test(prepared, features)

  return eval_models(model, X, y)

In [7]:
# Load the test set used for evaluation.
# NOTE(review): absolute, machine-specific Windows path - this cell only runs
# on the original author's machine; consider a configurable data directory.
test_data = pd.read_csv(r"C:\Users\alber\Desktop\Make Believe Diciembre\Data\Model Testing\data_pre_tp_test.csv")

# The .pkl file is unpacked into three objects: the model, the list of feature
# names it expects, and y_ref (not used in the cells visible here).
# NOTE(review): joblib.load unpickles arbitrary Python objects - only load
# model files from a trusted source.
model, features, y_ref = joblib.load(r"C:\Users\alber\Desktop\Make Believe Diciembre\Models\DESPLd__DESWLltd__WORD_PROPERTY_WRDHYPn__KNeighborsClassifier.pkl")


In [8]:
# Display the loaded test frame (a bare last expression uses the rich repr).
test_data

Unnamed: 0.1,Unnamed: 0,id,DESPC,DESSC,DESWC,DESPL,DESPLd,DESPLw,DESSL,DESSLd,...,WORD_PROPERTY_WRDPOLc,WORD_PROPERTY_WRDHYPn,WORD_PROPERTY_WRDHYPv,WORD_PROPERTY_WRDHYPnv,WORD_PROPERTY_AOA,WORD_PROPERTY_AOA_MAX,WORD_PROPERTY_CONCRETENESS,WORD_PROPERTY_PREVALENCE,WORD_PROPERTY_PREVALENCE_MIN,WORD_SET_INCIDENCE_C4_COMMON_WORDS
0,0,0,32.0,87.0,1366.0,2.718750,1.689328,41.687500,15.701149,9.903153,...,7.361516,6.241860,1.155405,4.168044,5.702507,17.19,2.344524,2.303225,2.303225,0.183016
1,1,0,1.0,1.0,1.0,1.000000,,1.000000,1.000000,,...,,,,,,,,,,0.000000
2,2,1,9.0,26.0,474.0,2.888889,2.147350,51.777778,18.230769,9.003589,...,7.392713,6.000000,1.539683,3.821705,5.577017,13.95,2.452883,2.264898,2.264898,0.137131
3,3,1,1.0,1.0,1.0,1.000000,,1.000000,1.000000,,...,,,,,,,,,,0.000000
4,4,2,17.0,17.0,453.0,1.888889,0.927961,49.444444,26.647059,7.849691,...,7.248000,6.076923,1.057692,4.069231,5.564612,14.40,2.484831,2.309875,2.309875,0.176600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12660,12660,6332,1.0,1.0,1.0,1.000000,,1.000000,1.000000,,...,,,,,,,,,,0.000000
12661,12661,6333,53.0,43.0,1173.0,1.592593,0.971092,42.444444,27.279070,13.272466,...,6.973396,6.081897,1.553191,4.369973,5.822636,16.50,2.473843,2.305394,2.305394,0.163683
12662,12662,6333,1.0,1.0,1.0,1.000000,,1.000000,1.000000,,...,,,,,,,,,,0.000000
12663,12663,6334,49.0,54.0,897.0,2.160000,1.491085,34.920000,16.611111,8.841191,...,8.493697,6.514925,1.370690,4.128000,5.453634,15.44,2.516343,2.291797,2.291797,0.136009


In [None]:
# Evaluate the loaded model on the test data with the feature list stored in
# the model file; prints the metrics and draws the confusion matrix.
magic(test_data, model, features)