# Task 0

*See the report.*

# Task 1

In [1]:
# Import libraries
import ast
import json
from collections import Counter
from pathlib import Path

import joblib
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Download nltk packages if necessary
# nltk.download('punkt')
# nltk.download('stopwords')

In [None]:
# ---- 1. Load data and map article types to either "fake" or "reliable" ----

# Mapping of types to "fake" or "reliable"
type_mapping = {
    'unreliable': 'fake',
    'fake': 'fake',
    'conspiracy': 'fake',
    'bias': 'fake',
    'junksci': 'fake',
    'clickbait': 'reliable',
    'reliable' : 'reliable',
    'state': 'fake',
    'political': 'reliable',
    'satire': 'fake',
    'hate': 'fake',
    'rumor': 'fake',
}

# Which preprocessed corpus to load. "995,000_rows" is the full run;
# switch to "15,000_rows" for a quick run on the small sample.
DATA_PREFIX = "995,000_rows"


def load_split(split, prefix=DATA_PREFIX):
    """Load one preprocessed split and attach its binary label.

    Parameters
    ----------
    split : str
        Split name used in the file name: "train", "valid" or "test".
    prefix : str
        Corpus size prefix of the file name (see DATA_PREFIX).

    Returns
    -------
    pandas.DataFrame
        Columns "content", "type" and "label". Rows whose content is missing
        or whose type is not a key of `type_mapping` are removed, because
        neither the vectorizer nor the metrics can use them.
    """
    frame = pd.read_csv(
        f"fakenewscorpus_data/{prefix}_preprocessed_{split}.csv",
        usecols=["content", "type"],
        dtype=str,
    )
    # Map types to labels; types missing from the mapping become NaN here
    frame["label"] = frame["type"].map(type_mapping)

    usable = frame.dropna(subset=["content", "label"]).reset_index(drop=True)
    dropped = len(frame) - len(usable)
    if dropped:
        # Make the filtering visible instead of silently shrinking a split
        print(f"{split}: dropped {dropped} rows without content or a mapped label")
    return usable


# Load data and include only relevant columns
data_train = load_split("train")
data_valid = load_split("valid")
data_test = load_split("test")

In [3]:
# Inspect how many training rows carry each label
label_distribution = data_train["label"].value_counts()
print(label_distribution)

label
fake        286875
reliable    121335
Name: count, dtype: int64


### Balance the training data

In [4]:
# Count occurrences of each class. value_counts() is vectorised, whereas the
# built-in sum() walks the boolean Series element by element in Python.
class_counts = data_train['label'].value_counts()
count_fake = int(class_counts.get('fake', 0))
count_reliable = int(class_counts.get('reliable', 0))

# Keep as many rows per class as the smaller class has
rows_to_keep = min(count_fake, count_reliable)
if rows_to_keep == 0:
    raise ValueError("Cannot balance: at least one of the classes 'fake'/'reliable' has no rows.")

fake_rows = data_train[data_train['label'] == 'fake']
reliable_rows = data_train[data_train['label'] == 'reliable']

# Randomly downsample whichever class is larger; the smaller class is kept whole.
# (Previously 'fake' was assumed to be the larger class, and sample() would raise
# if it was not.)
if count_fake >= count_reliable:
    df_fake = fake_rows.sample(n=rows_to_keep, random_state=42)
    df_reliable = reliable_rows
else:
    df_fake = fake_rows
    df_reliable = reliable_rows.sample(n=rows_to_keep, random_state=42)

# Combine the two classes back into a balanced dataframe
data_train = pd.concat([df_fake, df_reliable])

# Shuffle the resulting dataframe so the classes are interleaved
data_train = data_train.sample(frac=1, random_state=42).reset_index(drop=True)

# Fail loudly if the classes are not the same size after balancing
assert data_train['label'].value_counts().nunique() == 1, "classes are not balanced"

In [5]:
# Class distribution after downsampling (displayed as the cell result)
data_train.loc[:, "label"].value_counts()

label
reliable    121335
fake        121335
Name: count, dtype: int64

### Logistic Regression

In [6]:
# Import libraries
from sklearn.preprocessing import MaxAbsScaler
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from scipy import sparse


In [7]:
# Finding the top 10,000 words

# 'content' holds each article's token list as a string; parse it back into a list
content_as_lists = data_train['content'].apply(ast.literal_eval)

# Count article by article instead of materialising one list of every token in
# the corpus. This also keeps articles with an empty token list from adding a
# NaN entry to the counts, which Series.explode() would produce.
word_counts = Counter()
for tokens in content_as_lists:
    word_counts.update(tokens)

top_10000_words = [word for word, _ in word_counts.most_common(10000)]

In [None]:
# Save top 10,000_words as file

# Make sure the output folder exists so a fresh checkout does not fail on open()
Path("misc").mkdir(parents=True, exist_ok=True)

# Save as JSON-file
with open("misc/top_10000_words.json", "w", encoding="utf-8") as vocab_file:
    json.dump(top_10000_words, vocab_file)

print("List saved in the file 'top_10000_words.json'.")

In [None]:
# Build the Bag-of-Words matrices, restricted to the top-10,000 vocabulary.
# NOTE(review): 'content' is the string form of a token list, and CountVectorizer
# re-tokenises that string with its own default analyzer. Tokens in the vocabulary
# that the default analyzer cannot produce (punctuation, one-character tokens, ...)
# would presumably end up as all-zero columns - confirm, and consider
# analyzer=ast.literal_eval if the saved vocabulary's consumers can follow.
vectorizer = CountVectorizer(vocabulary=top_10000_words)

X_train, X_valid, X_test = (
    vectorizer.transform(split_frame['content'])
    for split_frame in (data_train, data_valid, data_test)
)

In [None]:
# Labels for each split
y_train, y_valid, y_test = data_train['label'], data_valid['label'], data_test['label']

# Fit the scaler on the training matrix only, then reuse it for the other splits
scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled, X_test_scaled = (
    scaler.transform(matrix) for matrix in (X_valid, X_test)
)

# Train Logistic Regression model
logistic_model = LogisticRegression(solver='saga', max_iter=100000, random_state=42)
logistic_model.fit(X_train_scaled, y_train)

In [None]:
# Persist the scaled test matrix and its labels; part 4 (evaluation)
# loads both files.
test_matrix_path = "misc/X_test_scaled.npz"
test_labels_path = 'misc/y_test.csv'

sparse.save_npz(test_matrix_path, X_test_scaled)
y_test.to_csv(test_labels_path, index=False)

In [None]:
# Score the fitted model on the validation split
y_pred = logistic_model.predict(X_valid_scaled)

f1 = f1_score(y_valid, y_pred, average='weighted')
print("Total F1-score:", f1)

# Print each summary under its own heading
validation_summaries = (
    ("Classification Report:", classification_report(y_valid, y_pred)),
    ("Confusion Matrix:", confusion_matrix(y_valid, y_pred)),
)
for heading, summary in validation_summaries:
    print(heading)
    print(summary)

In [None]:
# Persist the trained classifier to disk
model_path = 'misc/logistic_model.joblib'
joblib.dump(logistic_model, model_path)

# Task 2

*See the report.*

# Task 3: Load BBC-data and add to dataset

In [None]:
# Load BBC data
bbc_data = pd.read_csv("BBC_data/BBC_preprocessed.csv", usecols=["content", "type"], dtype=str)

# Give the BBC rows a label the same way the corpus rows got theirs; without this
# every BBC row has a NaN label after the concat below.
# NOTE(review): assumes the BBC 'type' values are keys of type_mapping - confirm.
bbc_data["label"] = bbc_data["type"].map(type_mapping)

# Append the BBC articles to the balanced training data
data_train_with_BBC = pd.concat([data_train, bbc_data], ignore_index=True)

In [None]:
# Find top 10,000 words

# 'content' holds each article's token list as a string; parse it back into a list
content_as_lists = data_train_with_BBC['content'].apply(ast.literal_eval)

# Count article by article instead of materialising one list of every token in
# the corpus. This also keeps articles with an empty token list from adding a
# NaN entry to the counts, which Series.explode() would produce.
word_counts = Counter()
for tokens in content_as_lists:
    word_counts.update(tokens)

top_10000_words = [word for word, _ in word_counts.most_common(10000)]

In [None]:
# Convert 'content' to Bag-of-Words feature matrix
vectorizer = CountVectorizer(vocabulary=top_10000_words)

X_train = vectorizer.transform(data_train['content'])
X_valid = vectorizer.transform(data_valid['content'])
X_test = vectorizer.transform(data_test['content'])

In [None]:
# Scale data
scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Extract labels
y_train = data_train['label']
y_valid = data_valid['label']
y_test = data_test['label']

# Train Logistic Regression model
logistic_model = LogisticRegression(max_iter=100000, solver='saga', random_state=42)
logistic_model.fit(X_train_scaled, y_train)

In [None]:
# Score the retrained model on the validation split
def _print_section(title, body):
    """Print a heading line followed by the given summary."""
    print(title)
    print(body)


y_pred = logistic_model.predict(X_valid_scaled)

f1 = f1_score(y_valid, y_pred, average='weighted')
print("Total F1-score:", f1)

_print_section("Classification Report:", classification_report(y_valid, y_pred))
_print_section("Confusion Matrix:", confusion_matrix(y_valid, y_pred))