# Find top 10000 vocab + Logistic Rgression

End of week 10: tasks 0-2

In [17]:
import re
import numpy as np
import pandas as pd
import nltk
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import requests
import ast

# Download nltk packages if necessary
# nltk.download('punkt')
# nltk.download('stopwords')

In [18]:
# ---- 1. Load and prepare data ----

# Mapping of types to "fake" or "reliable"
type_mapping = {
    'unreliable': 'fake',
    'fake': 'fake',
    'conspiracy': 'fake',
    'bias': 'fake',
    'junksci': 'fake',
    'clickbait': 'reliable',
    'reliable' : 'reliable',
    'state': 'fake',
    'political': 'reliable',
    'satire': 'fake',
    'hate': 'fake',
    'rumor': 'fake',
}

# Load data and filter relevant columns
data = pd.read_csv("15,000_rows_preprocessed.csv", usecols=["content", "type"], dtype=str)
#data = pd.read_csv("995,000_rows_preprocessed.csv", usecols=["content", "type"], dtype=str)

# ---- Load BBC-data and add to dataset ----
bbc_data = pd.read_csv("BBC_preprocessed.csv", usecols=["content", "type"], dtype=str)
data = pd.concat([data, bbc_data], ignore_index=True)

# Remove rows with unknown type
data = data[data['type'] != 'unknown']

# Map types to labels
data["label"] = data["type"].map(type_mapping)

# Remove NaN
data = data.dropna(subset=["label"])

# ---- 2. Split dataset to training, validation and test (80/10/10) ----

train, valid, test = np.split(
    data.sample(frac=1, random_state=42),  # Shuffle
    [int(0.8 * len(data)), int(0.9 * len(data))]  # Index for split
)

  return bound(*args, **kwds)


# Logistic Regression

In [19]:
from sklearn.preprocessing import MaxAbsScaler
import re
import numpy as np
import pandas as pd
import nltk
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# ---- 3. Convert text to Bag-of-Words feature matrix ----

content_as_lists = data['content'].apply(ast.literal_eval)

all_words = content_as_lists.explode().tolist()

word_counts = Counter(all_words)

top_10000_words = [word for word, _ in word_counts.most_common(10000)]

vectorizer = CountVectorizer(vocabulary=top_10000_words)


In [20]:
# Transform training, valid og test data
X_train = vectorizer.transform(train['content'])
X_valid = vectorizer.transform(valid['content'])
X_test = vectorizer.transform(test['content'])

In [None]:
# Convert to dense matrix (since StandardScaler does not support sparse matrices directly)
# X_train_dense = X_train.toarray()
# X_valid_dense = X_valid.toarray()
# X_test_dense = X_test.toarray()

scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Labels
y_train = train['label']
y_valid = valid['label']
y_test = test['label']

# ---- 4. Train Logistic Regression model ----

clf = LogisticRegression(max_iter=100000, solver='saga', random_state=42)
clf.fit(X_train_scaled, y_train)

# ---- 5. Evaluate model ----

y_pred = clf.predict(X_test_scaled)

print(classification_report(y_test, y_pred))

from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_pred))

# ---- 6. Validate model ---- (remove hashtags)

# y_pred = clf.predict(X_valid_scaled)

# print(classification_report(y_valid, y_pred))

# from sklearn.metrics import confusion_matrix
# print(confusion_matrix(y_valid, y_pred))


              precision    recall  f1-score   support

        fake       0.77      0.83      0.80       676
    reliable       0.82      0.75      0.79       694

    accuracy                           0.79      1370
   macro avg       0.79      0.79      0.79      1370
weighted avg       0.79      0.79      0.79      1370

[[562 114]
 [171 523]]


In [22]:
# # ---- 5. Indlæs top 10.000 ord fra CSV og konverter tekst til Bag-of-Words feature matrix ----

# # Indlæs ordforrådet fra CSV
# df_vocab = pd.read_csv("top_10000_vocabulary.csv")
# top_10000_words = df_vocab["Word"].tolist()  # Konverter til liste

# # Opret CountVectorizer med det indlæste ordforråd
# vectorizer = CountVectorizer(vocabulary=top_10000_words) # BoW bag of words

# # Transformer træning, valid og test data
# X_train = vectorizer.transform(train['content'])
# X_valid = vectorizer.transform(valid['content'])
# X_test = vectorizer.transform(test['content'])

# # Labels
# y_train = train['label']
# y_valid = valid['label']
# y_test = test['label']

# # ---- 6. Træn Logistic Regression model ----

# clf = LogisticRegression(max_iter=5000, solver='saga', random_state=42)
# clf.fit(X_train, y_train)

# # ---- 7. Evaluer modellen ----

# y_pred = clf.predict(X_test)

# # Udskriv resultater
# print(classification_report(y_test, y_pred))


# Kode kopieret fra YouTube: https://www.youtube.com/watch?v=aL21Y-u0SRs&t=212s

In [23]:
# from matplotlib import pyplot as plt
# import numpy as np

# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print(model.score(X_test, y_test))

# from sklearn.metrics import confusion_matrix
# print(confusion_matrix(y_test, y_pred))
# from sklearn.metrics import classification_report
# print(classification_report(y_test, y_pred))

# Gammel version 1... 

In [24]:
# import re
# import numpy as np
# import pandas as pd
# import nltk
# from collections import Counter
# from nltk.stem import PorterStemmer
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize

# # Ensure NLTK resources are available
# # nltk.download('punkt')
# # nltk.download('stopwords')

# # Load data efficiently
# data = pd.read_csv("995,000_rows.csv", usecols=["content"], dtype=str)

# # Shuffle and split dataset (80% train, 10% validation, 10% test)
# train, valid, test = np.split(
#     data.sample(frac=1, random_state=42),  # Shuffle dataset
#     [int(0.8 * len(data)), int(0.9 * len(data))]  # Indices for 80% train, 10% valid, 10% test
# )

# # Precompile regex patterns
# date_regex = re.compile(r"""\b(
#     (?:\d{1,2}[./-]\d{1,2}[./-]\d{2,4}) |
#     (?:\d{4}[./-]\d{1,2}[./-]\d{1,2}) |
#     (?:\b\d{1,2}(st|nd|rd|th)?\s+(of\s+)?[A-Za-z]+\s+\d{4}\b) |
#     (?:\b[A-Za-z]+\s+\d{1,2}(st|nd|rd|th)?,?\s+\d{4}\b) |
#     (?:\b[A-Za-z]+\s+\d{4}\b)
# )\b""", re.VERBOSE | re.IGNORECASE)

# number_regex = re.compile(r'\d+')
# email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
# url_regex = re.compile(r'\b(?:http[s]?://|www\.)[^\s<>"]+\b')
# whitespace_regex = re.compile(r'\s+')

# # Function to clean text
# def clean_text(text):
#     text = text.lower()
#     text = whitespace_regex.sub(' ', text)
#     text = date_regex.sub("<DATE>", text)
#     text = url_regex.sub("<URL>", text)
#     text = email_regex.sub("<EMAIL>", text)
#     text = number_regex.sub("<NUM>", text)
#     return text

# # Apply text cleaning to each split
# train['content'] = train['content'].fillna('').map(clean_text)
# valid['content'] = valid['content'].fillna('').map(clean_text)
# test['content'] = test['content'].fillna('').map(clean_text)

# # Tokenization and stopword filtering (only on train set)
# stop_words = set(stopwords.words('english'))
# tokens = [
#     word for sentence in train['content'] 
#     for word in word_tokenize(sentence) 
#     if word.isalpha() and word not in stop_words
# ]

# # Stemming
# stemmer = PorterStemmer()
# stemmed_tokens = [stemmer.stem(word) for word in tokens]

# # Compute word frequencies
# word_counts = Counter(stemmed_tokens)

# # Extract top 10,000 words
# top_10000 = word_counts.most_common(10000)

# # Export to CSV
# df_top_words = pd.DataFrame(top_10000, columns=['Word', 'Frequency'])
# df_top_words.to_csv("top_10000_vocabulary.csv", index=False)

# print("Top 10,000 words saved to top_10000_vocabulary.csv")
