# Find top 10000 vocab + Logistic Rgression

End of week 10: tasks 0-2

In [1]:
import re
import numpy as np
import pandas as pd
import nltk
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import requests
import ast

# Download nltk packages if necessary
# nltk.download('punkt')
# nltk.download('stopwords')

In [None]:
# ---- 1. Load and prepare data ----

# Mapping of types to "fake" or "reliable"
type_mapping = {
    'unreliable': 'fake',
    'fake': 'fake',
    'conspiracy': 'fake',
    'bias': 'fake',
    'junksci': 'fake',
    'clickbait': 'reliable',
    'reliable' : 'reliable',
    'state': 'fake',
    'political': 'reliable',
    'satire': 'fake',
    'hate': 'fake',
    'rumor': 'fake',
}

# Load data and filter relevant columns
data_train = pd.read_csv("15,000_rows_preprocessed_train.csv", usecols=["content", "type"], dtype=str)
data_valid = pd.read_csv("15,000_rows_preprocessed_valid.csv", usecols=["content", "type"], dtype=str)
data_test = pd.read_csv("15,000_rows_preprocessed_test.csv", usecols=["content", "type"], dtype=str)
#data = pd.read_csv("995,000_rows_preprocessed.csv", usecols=["content", "type"], dtype=str)



# Map types to labels
data_train["label"] = data_train["type"].map(type_mapping)
data_valid["label"] = data_valid["type"].map(type_mapping)
data_test["label"] = data_test["type"].map(type_mapping)




# Logistic Regression

In [15]:
# Import libraries
from sklearn.preprocessing import MaxAbsScaler
import re
import numpy as np
import pandas as pd
import nltk
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


In [8]:

# Finding the top 10,000 words

content_as_lists = data_train['content'].apply(ast.literal_eval)

all_words = content_as_lists.explode().tolist()

word_counts = Counter(all_words)

top_10000_words = [word for word, _ in word_counts.most_common(10000)]



In [9]:
# Transform training, valid og test data
vectorizer = CountVectorizer(vocabulary=top_10000_words)

X_train = vectorizer.transform(data_train['content'])
X_valid = vectorizer.transform(data_valid['content'])
X_test = vectorizer.transform(data_test['content'])

### Convert text to Bag-of-Words feature matrix

In [None]:
# Convert to dense matrix (since StandardScaler does not support sparse matrices directly)
# X_train_dense = X_train.toarray()
# X_valid_dense = X_valid.toarray()
# X_test_dense = X_test.toarray()

scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Labels
y_train = data_train['label']
y_valid = data_valid['label']
y_test = data_test['label']

# ---- 4. Train Logistic Regression model ----

logistic_model = LogisticRegression(max_iter=100000, solver='saga', random_state=42)
logistic_model.fit(X_train_scaled, y_train)




In [None]:
# Evaluate with validation
# ---- 6. Validate model ---- (remove hashtags)

# y_pred = clf.predict(X_valid_scaled)

# print(classification_report(y_valid, y_pred))

# from sklearn.metrics import confusion_matrix
# print(confusion_matrix(y_valid, y_pred))


In [18]:
# Evaluate with test
# ---- 5. Evaluate model ----

y_pred = logistic_model.predict(X_test_scaled)

f1 = f1_score(y_test, y_pred, average='weighted')
print("Total F1-score:", f1)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Total F1-score: 0.7805091819904315
Classification Report:
              precision    recall  f1-score   support

        fake       0.77      0.84      0.80       672
    reliable       0.80      0.72      0.76       610

    accuracy                           0.78      1282
   macro avg       0.78      0.78      0.78      1282
weighted avg       0.78      0.78      0.78      1282

Confusion Matrix:
[[564 108]
 [172 438]]


# Task 3: Load BBC-data and add to dataset

In [None]:

# bbc_data = pd.read_csv("BBC_preprocessed.csv", usecols=["content", "type"], dtype=str)
# data_train_with_BBC = pd.concat([data_train, bbc_data], ignore_index=True)