# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

import warnings
 
# NOTE(review): with no `category` or `module` restriction this filter silences
# every warning raised after this point, so warnings emitted by the libraries
# used in the cells below will not be displayed either.
warnings.filterwarnings(action='ignore')

# Processing Data

In [31]:
# Load the two source files; each one holds the articles of a single class.
fake_news = pd.read_csv("Fake.csv")
true_news = pd.read_csv("True.csv")
# add labels
fake_news["label"] = "Fake"
true_news["label"] = "True"

# combine the two separate dataframes
full_data = pd.concat([fake_news, true_news], ignore_index = True)

# A missing article body is read as NaN (a float), which simple_preprocess cannot
# tokenize. Treat such rows as empty documents so that this cell and every later
# cell reading full_data["text"] only ever sees strings.
full_data["text"] = full_data["text"].fillna("")
corpus = full_data["text"]

# One token list per article (simple_preprocess lowercases and tokenizes).
tokenized_corpus = [simple_preprocess(document) for document in corpus]

stop_words = set(stopwords.words('english'))  # NLTK stop words

# Tokens coming out of simple_preprocess are already lowercase, so they can be
# compared against the stop-word set directly.
filtered_corpus = [
    [token for token in document if token not in stop_words]
    for document in tokenized_corpus
]

In [32]:
# Preview the first rows of the combined, labelled frame.
full_data.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",Fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",Fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",Fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",Fake


# Run and Save Embeddings

Word2Vec is a word embedding technique that uses shallow neural networks to map words to vectors such that similar words are closer together. This ensures that the semantic meaning is captured by the embedding since words with similar meanings have similar vector representations. There are two main models that fall under Word2Vec: continuous bag of words (CBOW) and skip-gram.

CBOW predicts a target word from its surrounding context: given the words that fall within a fixed-size window around the target, the model predicts the target word itself.

Skip-gram is the reverse of CBOW: given a word, the model predicts the words that surround it. We tried both embedding techniques to determine which one would give better classification performance.

In [28]:
# Hyperparameters shared by both architectures:
# - vector_size: dimensionality of the word embeddings
# - window: context window size
# - min_count: words appearing fewer than min_count times are ignored
# - workers: number of worker threads used for training
shared_w2v_params = dict(vector_size=100, window=5, min_count=1, workers=4)

# sg selects the architecture: 0 for CBOW (Continuous Bag of Words), 1 for Skip-gram
model_cbow = Word2Vec(sentences=filtered_corpus, sg=0, **shared_w2v_params)
model_sg = Word2Vec(sentences=filtered_corpus, sg=1, **shared_w2v_params)

# Persist both trained models so later cells can reload them from disk.
model_cbow.save("word2vec_cbow.model")
model_sg.save("word2vec_sg.model")

# Fit Models

## CBOW

In [39]:
# Load the CBOW model from the file written by the embedding cell
cbow = Word2Vec.load("word2vec_cbow.model")

def sentence_to_vector(sentence, model):
    """Convert a piece of text to one vector by averaging its word embeddings.

    Parameters
    ----------
    sentence : str
        Raw text; it is tokenized with ``simple_preprocess``. ``None`` or NaN
        (how pandas represents a missing cell) is treated as empty text.
    model : gensim.models.Word2Vec
        Trained model; embeddings are looked up in ``model.wv``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the mean of the embeddings of
        the tokens present in the vocabulary, or all zeros when there are none.
    """
    # A missing cell arrives as None or a float NaN, which simple_preprocess
    # cannot decode; it simply contributes no tokens.
    if sentence is None or (isinstance(sentence, float) and np.isnan(sentence)):
        words = []
    else:
        words = simple_preprocess(sentence)  # Tokenization

    # Only tokens the model has an embedding for can take part in the average.
    known_words = [word for word in words if word in model.wv]

    if known_words:
        # Indexing with a list returns one row per word; average over the rows.
        return model.wv[known_words].mean(axis=0)

    # Build the fallback in the embeddings' own dtype so that stacking many
    # results never upcasts the whole feature matrix because of one empty text.
    return np.zeros(model.vector_size, dtype=model.wv.vectors.dtype)

# Raw article text is the input, the Fake/True label is the target.
X, y = full_data["text"], full_data["label"]

# Embed every article with the CBOW model: one fixed-length row per document.
X_vectors = np.array([sentence_to_vector(document, cbow) for document in X])

# Stratified 75/25 hold-out split; the fixed random_state keeps it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectors,
    y,
    test_size=0.25,
    random_state=20,
    stratify=y,
)

### Logistic Regression

In [43]:
# Fit a default logistic-regression classifier on the CBOW training features
# (fit() returns the estimator itself, so construction and training chain).
clf = LogisticRegression().fit(X_train, y_train)

# Label the held-out documents
y_pred = clf.predict(X_test)

# Share of held-out documents that were labelled correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Logistic Accuracy: {accuracy:.2f}")

Logistic Accuracy: 0.96


### SVM With Linear Kernel

In [44]:
# Train Linear SVM model.
# random_state pins the solver's internal data shuffling so that re-running this
# cell reproduces the same model; 20 is the seed already used for the split.
svm_model = LinearSVC(random_state=20)
svm_model.fit(X_train, y_train)

# Predict on test data
y_pred = svm_model.predict(X_test)

# Evaluate accuracy (share of held-out documents labelled correctly)
accuracy = accuracy_score(y_test, y_pred)
print(f"Linear SVM Accuracy: {accuracy:.2f}")

Linear SVM Accuracy: 0.96


## Skip-grams

In [45]:
# Reload the Skip-gram model from the file written by the embedding cell
sg = Word2Vec.load("word2vec_sg.model")

# Same averaging step as for CBOW, this time with the Skip-gram embeddings.
X_vectors_sg = np.array([sentence_to_vector(document, sg) for document in X])

# Identical split settings to the CBOW run, so both embeddings are evaluated
# on the same partition of documents.
X_train_sg, X_test_sg, y_train_sg, y_test_sg = train_test_split(
    X_vectors_sg,
    y,
    test_size=0.25,
    random_state=20,
    stratify=y,
)

### Logistic Regression

In [48]:
# Fit a default logistic-regression classifier on the Skip-gram training features
# (fit() returns the estimator itself, so construction and training chain).
clf_sg = LogisticRegression().fit(X_train_sg, y_train_sg)

# Label the held-out documents
y_pred_sg = clf_sg.predict(X_test_sg)

# Share of held-out documents that were labelled correctly
accuracy_sg = accuracy_score(y_true=y_test_sg, y_pred=y_pred_sg)
print(f"Logistic Accuracy: {accuracy_sg:.2f}")

Logistic Accuracy: 0.96


### SVM With Linear Kernel

In [49]:
# Train Linear SVM model.
# random_state pins the solver's internal data shuffling so that re-running this
# cell reproduces the same model; 20 is the seed already used for the split.
svm_model_sg = LinearSVC(random_state=20)
svm_model_sg.fit(X_train_sg, y_train_sg)

# Predict on test data
y_pred_sg = svm_model_sg.predict(X_test_sg)

# Evaluate accuracy (share of held-out documents labelled correctly)
accuracy_sg = accuracy_score(y_test_sg, y_pred_sg)
print(f"Linear SVM Accuracy: {accuracy_sg:.2f}")

Linear SVM Accuracy: 0.96
