# Find top 10000 vocab + Logistic Regression


In [None]:
import pandas as pd
import joblib
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from scipy import sparse
import json
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

Task 1: Evaluate the simple logistic regression model on the fake news corpus test set


In [None]:
# Load the logistic regression model from its joblib file.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code -- only load model files that come from a trusted source.
logistic_model = joblib.load('logistic_model.joblib')

In [None]:
# Load the test split: the feature matrix is read as a SciPy sparse matrix from
# an .npz file, the labels from a CSV file.
X_test_scaled = sparse.load_npz("X_test_scaled.npz")
# NOTE(review): read_csv returns a DataFrame rather than a 1-D Series. The
# metric calls that consume y_test presumably expect a single label column --
# confirm that y_test.csv was written without an index column.
y_test = pd.read_csv("y_test.csv")

In [None]:
# Predict on the fake-news test split and report the metrics.
y_pred = logistic_model.predict(X_test_scaled)

# Overall score: F1 with 'weighted' averaging across the classes.
f1 = f1_score(y_test, y_pred, average='weighted')
print("Total F1-score:", f1)

# Per-class breakdown followed by the raw confusion counts; sep="\n" puts each
# heading on its own line above the value it introduces.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Task 2: Evaluate the logistic regression model on the LIAR data

Load "LIAR" dataset

In [None]:
# Read the preprocessed LIAR dataset, keeping only the two columns used below
# ('type' and 'content') and parsing every value as a string.
liar_data = pd.read_csv(
    "liar_dataset/liar_full_dataset_preprocessed.csv",
    usecols=['type', 'content'],
    dtype=str,
)

In the LIAR dataset, the 'content' variable has already been cleaned and tokenized, stop words have
been removed, and all tokens have been stemmed. This was done with our preprocessing
pipeline. We still need to remove rows where either 'type' or 'content' is NaN, and we need to
remove duplicated rows:

In [None]:
# Keep only rows that have both a 'type' and a 'content' value, then collapse
# rows that are exact duplicates of an earlier row.
liar_data = (
    liar_data
    .dropna(subset=['type', 'content'])
    .drop_duplicates()
)

In [None]:
# Collapse the six LIAR truthfulness ratings into the two labels used by the
# fake-news model. The ratings form an ordered scale
# (pants-fire < false < barely-true < half-true < mostly-true < true), so the
# split is kept monotonic: the three least truthful ratings become 'fake' and
# the three most truthful become 'reliable'. The previous mapping had
# 'barely-true' and 'half-true' the wrong way round, labelling the less
# truthful rating as 'reliable' and the more truthful one as 'fake'.
type_mapping_liar = {
    'pants-fire': 'fake',
    'false': 'fake',
    'barely-true': 'fake',
    'half-true': 'reliable',
    'mostly-true': 'reliable',
    'true': 'reliable',
}

In [None]:
# Derive the binary 'label' column from the LIAR rating stored in 'type'.
# Series.map yields NaN for any 'type' value that is not a key of
# type_mapping_liar, so an unexpected rating would show up as a missing label.
liar_data["label"] = liar_data["type"].map(type_mapping_liar)

In [None]:
# Read the saved word list that serves as the bag-of-words vocabulary.
with open("top_10000_words.json", mode="r", encoding="utf-8") as vocab_file:
    top_10000_words = json.load(vocab_file)

In [None]:
# Bag-of-words vectorizer restricted to the loaded word list, so no fitting is
# required before calling transform.
# NOTE(review): this has to be the same vocabulary, in the same order, that the
# loaded model was trained with -- confirm against the training notebook.
vectorizer = CountVectorizer(vocabulary=top_10000_words)

In [None]:
# Turn the LIAR 'content' column into a bag-of-words count matrix over the
# fixed vocabulary held by the vectorizer.
X_liar = vectorizer.transform(liar_data['content'])

In [None]:
# Load the scaler from its joblib-format file.
# NOTE(review): presumably this is the scaler that was fitted on the training
# data -- confirm. As with any pickle-based file, only load it from a trusted
# source.
scaler = joblib.load('scaler.pkl')

In [None]:
# Scale the LIAR features with the loaded scaler (transform only, no refit).
X_liar_scaled = scaler.transform(X_liar)

In [None]:
# Ground-truth labels for the LIAR rows, taken from the mapped 'label' column.
y_liar = liar_data['label']

In [None]:
# Predict a label for every LIAR row with the loaded logistic regression model.
y_pred_liar = logistic_model.predict(X_liar_scaled)

In [None]:
# Report how the model's LIAR predictions compare with the mapped labels.
f1_liar = f1_score(y_liar, y_pred_liar, average='weighted')
print("Total F1-score:", f1_liar)

# Per-class breakdown followed by the raw confusion counts; sep="\n" puts each
# heading on its own line above the value it introduces.
print("Classification Report:", classification_report(y_liar, y_pred_liar), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_liar, y_pred_liar), sep="\n")

# Logistic Regression

In [None]:
import ast
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler

# ---- 3. Convert text to Bag-of-Words feature matrix ----

# Each 'content' cell is parsed with ast.literal_eval, i.e. it is expected to
# hold the string form of a Python list of tokens.
content_as_lists = data['content'].apply(ast.literal_eval)

# Flatten to one token per entry. explode() emits NaN for every empty token
# list, so drop those placeholders; otherwise NaN is counted like a word and
# can end up inside the vocabulary.
all_words = content_as_lists.explode().dropna().tolist()

word_counts = Counter(all_words)

# Keep the 10,000 most frequent tokens as the fixed vocabulary.
# NOTE(review): `data` is defined outside this cell. If it also contains the
# validation/test rows, the vocabulary is selected with knowledge of held-out
# data -- consider counting over the training split only.
top_10000_words = [word for word, _ in word_counts.most_common(10000)]

# With a fixed vocabulary the vectorizer needs no fitting before transform().
vectorizer = CountVectorizer(vocabulary=top_10000_words)


In [None]:
# Transform the training, validation and test splits into bag-of-words count
# matrices over the fixed vocabulary held by `vectorizer`.
X_train = vectorizer.transform(train['content'])
X_valid = vectorizer.transform(valid['content'])
X_test = vectorizer.transform(test['content'])

# Transform the LIAR test data to bag-of-words as well. This line used to be
# commented out, which left X_test_liar undefined even though the prediction
# step further down reads it.
X_test_liar = vectorizer.transform(liar_test_data['content'])

In [None]:
# Fit the max-abs scaler on the training matrix only, then push every split
# through that same fitted scaler.
scaler = MaxAbsScaler().fit(X_train)
X_train_scaled, X_valid_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_valid, X_test)
)
# The LIAR matrix is not scaled in this cell.

# Target labels for each split.
y_train, y_valid, y_test = (frame['label'] for frame in (train, valid, test))

y_test_liar = liar_test_data['label']

from sklearn.metrics import confusion_matrix

# ---- 4. Train Logistic Regression model ----

# Fitted on the MaxAbs-scaled training matrix; random_state fixes the solver's
# randomness so reruns are comparable.
clf = LogisticRegression(max_iter=100000, solver='saga', random_state=42)
clf.fit(X_train_scaled, y_train)

# ---- 5. Predict on the LIAR test data ----

# The classifier was fitted on scaled features, so the LIAR matrix has to pass
# through the same fitted scaler before prediction. Feeding raw counts to a
# model trained on scaled counts makes the features incomparable, and choosing
# the preprocessing by which variant scores better on the test data is not a
# valid evaluation.
X_test_liar_scaled = scaler.transform(X_test_liar)
y_pred_liar = clf.predict(X_test_liar_scaled)

# ---- 6. Evaluate model on LIAR test ----

f1 = f1_score(y_test_liar, y_pred_liar, average='weighted')  # or 'macro'
print("Total F1-score: LIAR dataset:", f1)

print("Classification Report:")
print(classification_report(y_test_liar, y_pred_liar))

print("Confusion Matrix:")
print(confusion_matrix(y_test_liar, y_pred_liar))



# ---- Original test with test set (remove # for comparison)----
# print("Original test with test set - classification report")
# print(classification_report(y_test, clf.predict(X_test_scaled)))