In [1]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
nltk.download("stopwords")
plt.rcParams.update({"font.size": 20})

# Pre-Proccess data

In [3]:
original = pd.read_csv(
    "/Users/vigrel/Git/NLP/nlp-intermediate-exam/data/WELFake_Dataset.csv"
)
original = original.dropna(subset="text")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    return " ".join(
        [
            lemmatizer.lemmatize(word)
            for word in text.split()
            if word.lower() not in stop_words
        ]
    )


original["text"] = original.text.apply(preprocess_text)

In [None]:
data = original.copy()
data.label.value_counts()

# Pipeline Analysis

In [5]:
full_vectorizer = CountVectorizer(
    binary=True,
    lowercase=True,
)

full_X = full_vectorizer.fit_transform(data.text)
full_word_doc_freq = np.asarray(full_X.sum(axis=0)).flatten()

In [13]:
vectorizer = CountVectorizer(
    stop_words="english",
    binary=True,
    min_df=0.05,
    max_df=0.8,
    lowercase=True,
)

X = vectorizer.fit_transform(data.text)
word_doc_freq = np.asarray(X.sum(axis=0)).flatten()

In [None]:
plt.hist(full_word_doc_freq, bins=30, log=True, alpha=0.5, color="orange", label="full")
plt.hist(word_doc_freq, bins=30, log=True, alpha=0.5, color="blue", label="partial")
plt.legend()
plt.xlabel("Document Frequency of Words")
plt.ylabel("Count of Words")
plt.title("Word Document Frequency Distribution")
plt.show()

# Feature Importance Analysis

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    data["text"], data["label"], test_size=0.2, random_state=42
)

pipe = Pipeline(
    [
        ("vectorizer", vectorizer),
        ("model", LogisticRegression()),
    ]
)
model = pipe.fit(X_train, y_train)

pred = model.predict(X_test)
sns.heatmap(
    confusion_matrix(y_test, pred),
    annot=True,
    fmt="",
    cmap="Blues",
    xticklabels=["R-News", "F-News"],
    yticklabels=["R-News", "F-News"],
)
plt.xlabel("Predicted Labels")
plt.ylabel("Real Labels")
plt.show()

In [60]:
coefs = pipe["model"].coef_
classes = pipe["model"].classes_
vocabulary = pipe["vectorizer"].vocabulary_

words_and_weights = [
    (coefs[0, idx], word)
    for word, idx in vocabulary.items()
    if not word.isnumeric() and len(word) > 3
]

sorted_tuples = sorted(words_and_weights)
counts, words = zip(*sorted_tuples)

num_words = 10
x_axis = np.arange(num_words)

In [None]:
plt.figure(figsize=(12, 6))
plt.title(
    "Top Features for Reliable News",
)
bars = plt.bar(
    x_axis[0:num_words],
    [abs(c) for c in counts[0:num_words]],
    color="green",
    width=0.6,
    alpha=0.75,
)
plt.xticks(x_axis[0:num_words], words[0:num_words], rotation=70, ha="right")
plt.ylabel("Coefficient Weight")

for bar in bars:
    yval = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        yval,
        -round(yval, 2),
        ha="center",
        va="bottom",
    )

plt.gca().set_yticks([])
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
plt.title("Top Features for Fake News")
bars = plt.bar(
    x_axis[-num_words:], counts[-num_words:], color="red", width=0.6, alpha=0.6
)
plt.xticks(x_axis[-num_words:], words[-num_words:], rotation=70, ha="right")
plt.ylabel("Coefficient Weight")

for bar in bars:
    yval = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        yval,
        round(yval, 2),
        ha="center",
        va="bottom",
    )

plt.gca().set_yticks([])
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)

plt.tight_layout()
plt.show()

# Model Scores 

In [None]:
def train_and_evaluate(data, vectorizer):
    X_train, X_test, y_train, y_test = train_test_split(
        data["text"],
        data["label"],
        test_size=0.2,
        random_state=None,
        stratify=data["label"],
    )

    pipe = Pipeline(
        [
            ("vectorizer", vectorizer),
            ("model", LogisticRegression()),
        ]
    )

    model = pipe.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    return precision, recall, f1, balanced_accuracy


num_iterations = 100
results = Parallel(n_jobs=-1)(
    delayed(train_and_evaluate)(data, vectorizer) for _ in range(num_iterations)
)

precision_scores, recall_scores, f1_scores, balanced_accuracy_scores = zip(*results)

mean_precision = np.mean(precision_scores)
mean_recall = np.mean(recall_scores)
mean_f1 = np.mean(f1_scores)
mean_balanced_accuracy = np.mean(balanced_accuracy_scores)

print(f"Mean Precision: {mean_precision:.4f}")
print(f"Mean Recall: {mean_recall:.4f}")
print(f"Mean F1 Score: {mean_f1:.4f}")
print(f"Mean Balanced Accuracy: {mean_balanced_accuracy:.4f}")

# Training Curve

In [None]:
f_list = np.logspace(0, -4, 25)


def get_train_test_accuracy(plot_list, genre_list):
    model_lr = pipe

    X_train, X_test, y_train, y_test = train_test_split(
        plot_list, genre_list, test_size=0.2, random_state=None
    )
    model_lr.fit(X_train, y_train)

    y_train_pred = model_lr.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)

    y_test_pred = model_lr.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    return train_accuracy, test_accuracy


def process_fraction(f, data):
    accuracy_f_train = []
    accuracy_f_test = []
    for _ in range(50):
        df_sample = data.sample(frac=f)
        train_accuracy, test_accuracy = get_train_test_accuracy(
            df_sample["text"], df_sample["label"]
        )

        accuracy_f_train.append(train_accuracy)
        accuracy_f_test.append(test_accuracy)

    return np.mean(accuracy_f_train), np.mean(accuracy_f_test)


def get_accuracy_sample(df, f_list):
    results = Parallel(n_jobs=-1)(delayed(process_fraction)(f, df) for f in f_list)

    train_accuracies, test_accuracies = zip(*results)

    return train_accuracies, test_accuracies


train_accuracies, test_accuracies = get_accuracy_sample(data, f_list)

In [None]:
plt.figure(figsize=(12, 6))
error_train = [1 - x for x in train_accuracies]
error_test = [1 - x for x in test_accuracies]

plt.plot(f_list * len(data), error_test, label="Test")
plt.plot(f_list * len(data), error_train, label="Train")

# plt.gca().spines['top'].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)
plt.ylim(ymax=0.15, ymin=0)

plt.gca().set_yticks([0, 0.05, 0.1, 0.15])
plt.title("Training curves")
plt.ylabel("Error rate\n(1 - accuracy)")
plt.xlabel("Number of samples")
plt.axhline(
    y=(error_test[0] + error_train[0]) / 2, color="gray", linestyle="--", alpha=0.3
)
plt.text(
    0,
    0.068,
    round((error_test[0] + error_train[0]) / 2, 3),
    ha="center",
    va="bottom",
    color="gray",
    alpha=0.3,
)

plt.legend()
plt.tight_layout()
plt.show()

# Topic Analysis

In [14]:
nmf = NMF(n_components=2)
X_nmf = nmf.fit_transform(X)

In [None]:
n_words = 20


def print_words_in_topics(nmf, vectorizer):
    words = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(nmf.components_):
        print(f"Topic {idx}")
        for i in topic.argsort()[-n_words:]:
            print(words[i])
        print()
    print()


print_words_in_topics(nmf, vectorizer)

* Topic 0: Generic News
* Topic 1: Elections News

In [18]:
def train_and_evaluate(topic_X, topic_y):
    X_train, X_test, y_train, y_test = train_test_split(
        topic_X, topic_y, test_size=0.2, random_state=None
    )

    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    return precision, recall, f1, balanced_accuracy


topic_assignments = np.argmax(X_nmf, axis=1)

for topic in range(nmf.n_components):
    topic_indices = np.where(topic_assignments == topic)[0]
    topic_X = X[topic_indices]
    topic_y = data.label.iloc[topic_indices]

    num_iterations = 100

    results = Parallel(n_jobs=-1)(
        delayed(train_and_evaluate)(topic_X, topic_y) for _ in range(num_iterations)
    )

    precision_scores, recall_scores, f1_scores, balanced_accuracy_scores = zip(*results)

    mean_precision = np.mean(precision_scores)
    mean_recall = np.mean(recall_scores)
    mean_f1 = np.mean(f1_scores)
    mean_balanced_accuracy = np.mean(balanced_accuracy_scores)

    print(f"Classification report for topic {topic}:")
    print(f"Mean Precision: {mean_precision:.4f}")
    print(f"Mean Recall: {mean_recall:.4f}")
    print(f"Mean F1 Score: {mean_f1:.4f}")
    print(f"Mean Balanced Accuracy: {mean_balanced_accuracy:.4f}")
    print()



Classification report for topic 0:
Mean Precision: 0.9220
Mean Recall: 0.9231
Mean F1 Score: 0.9222
Mean Balanced Accuracy: 0.8860

Classification report for topic 1:
Mean Precision: 0.9356
Mean Recall: 0.9354
Mean F1 Score: 0.9355
Mean Balanced Accuracy: 0.9284

