# Task 1 - evaluate logistic regression model on FakeNewsCorpus test data


In [11]:
# import libraries
import pandas as pd
import joblib
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from scipy import sparse
import json
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [2]:
# Restore the persisted logistic regression classifier from disk.
# NOTE: joblib.load unpickles the file, so only load artifacts that come from a trusted source.
logistic_model_path = 'misc/logistic_model.joblib'
logistic_model = joblib.load(logistic_model_path)

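Note: loading a model with `joblib.load` unpickles the file, which can execute arbitrary code, and the model should be loaded with the same scikit-learn version it was saved with. See the scikit-learn documentation on the limitations of model persistence: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations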


In [3]:
# Read the test split: the feature matrix (stored as a SciPy sparse .npz file)
# and the corresponding labels (stored as CSV).
test_features_path = "misc/X_test_scaled.npz"
test_labels_path = "misc/y_test.csv"

X_test_scaled = sparse.load_npz(test_features_path)
y_test = pd.read_csv(test_labels_path)

In [4]:
# Score the model on the FakeNewsCorpus test split.
y_pred = logistic_model.predict(X_test_scaled)

# Weighted F1: per-class F1 averaged with class support as the weights.
f1 = f1_score(y_test, y_pred, average='weighted')
print("Total F1-score:", f1)

# Per-class precision / recall / F1 table.
print("Classification Report:")
test_report = classification_report(y_test, y_pred)
print(test_report)

# Confusion matrix: rows are the true classes, columns the predicted classes.
print("Confusion Matrix:")
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

Total F1-score: 0.8093849082493505
Classification Report:
              precision    recall  f1-score   support

        fake       0.90      0.82      0.86     35942
    reliable       0.64      0.77      0.70     15085

    accuracy                           0.80     51027
   macro avg       0.77      0.80      0.78     51027
weighted avg       0.82      0.80      0.81     51027

Confusion Matrix:
[[29411  6531]
 [ 3435 11650]]


# Task 2 - evaluate the logistic regression model on the LIAR data 

In [6]:
# Read the preprocessed LIAR CSV, keeping only the two columns used below
# and loading every value as a string.
liar_columns = ['type', 'content']
liar_data = pd.read_csv(
    "liar_data/liar_full_dataset_preprocessed.csv",
    usecols=liar_columns,
    dtype=str,
)

In the LIAR dataset, the 'content' column has already been cleaned and tokenized, stop words have
been removed, and all tokens have been stemmed. This was done with our preprocessing
pipeline. We still need to remove rows where either 'type' or 'content' is NaN, and we need to
remove duplicated rows:

In [7]:
# Keep only rows that have both a 'type' and a 'content' value,
# then discard rows that are exact duplicates of an earlier row.
liar_data = (
    liar_data
    .dropna(subset=['type', 'content'])
    .drop_duplicates()
)

In [8]:
# Collapse the six LIAR truthfulness ratings into the binary labels 'fake' / 'reliable'.
# The ratings form an ordered scale:
#   pants-fire < false < barely-true < half-true < mostly-true < true
# The three least truthful ratings map to 'fake' and the three most truthful to 'reliable'.
# (Previously 'barely-true' was mapped to 'reliable' and 'half-true' to 'fake', which
# contradicts that ordering.)
type_mapping_liar = {
    'pants-fire': 'fake',
    'false': 'fake',
    'barely-true': 'fake',
    'half-true': 'reliable',
    'mostly-true': 'reliable',
    'true': 'reliable',
}

liar_data["label"] = liar_data["type"].map(type_mapping_liar)

# Series.map yields NaN for any rating that is not a key of the mapping. Fail here with a
# clear message instead of letting NaN labels reach the metric functions further down.
unmapped_types = liar_data.loc[liar_data["label"].isna(), "type"].unique()
if len(unmapped_types) > 0:
    raise ValueError(f"Unmapped LIAR 'type' values: {list(unmapped_types)}")

In [None]:
# Read the vocabulary: the top 10000 words in the FakeNewsCorpus training data.
# NOTE(review): the saved run of this cell failed with "JSONDecodeError ... (char 0)", i.e. the
# file existed but did not start with a valid JSON value - check that it is non-empty JSON.
vocabulary_path = "misc/top_10000_words.json"
with open(vocabulary_path, mode="r", encoding="utf-8") as vocabulary_file:
    top_10000_words = json.loads(vocabulary_file.read())

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Turn the LIAR 'content' column into a Bag-of-Words count matrix.
# The vocabulary is fixed to top_10000_words, so the vectorizer is not fitted on LIAR.
liar_texts = liar_data['content']
vectorizer = CountVectorizer(vocabulary=top_10000_words)
X_liar = vectorizer.transform(liar_texts)

In [None]:
# Restore the scaler saved in Part 2 (made on the FakeNewsCorpus training data).
# NOTE: joblib.load unpickles the file, so only load artifacts that come from a trusted source.
scaler_path = 'misc/scaler.pkl'
scaler = joblib.load(scaler_path)

In [None]:
# Scale the LIAR Bag-of-Words matrix with the loaded scaler.
# Only transform() is called, so the scaler is applied as loaded and is not refitted on LIAR.
X_liar_scaled = scaler.transform(X_liar)

In [None]:
# Ground-truth targets for LIAR: the 'label' column of liar_data
y_liar = liar_data.loc[:, 'label']

In [None]:
# Predict a label for every row of the scaled LIAR feature matrix
# using the logistic regression model loaded earlier in the notebook.
y_pred_liar = logistic_model.predict(X_liar_scaled)

In [None]:
# Score the LIAR predictions against the LIAR labels.

# Weighted F1: per-class F1 averaged with class support as the weights.
f1_liar = f1_score(y_liar, y_pred_liar, average='weighted')
print("Total F1-score:", f1_liar)

# Per-class precision / recall / F1 table.
print("Classification Report:")
liar_report = classification_report(y_liar, y_pred_liar)
print(liar_report)

# Confusion matrix: rows are the true classes, columns the predicted classes.
print("Confusion Matrix:")
liar_confusion = confusion_matrix(y_liar, y_pred_liar)
print(liar_confusion)