In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report


### Load dataset

In [22]:
# Read the two source CSVs (one per class); paths are relative to the
# notebook's working directory.
true_news = pd.read_csv(filepath_or_buffer='True.csv/True.csv')
fake_news = pd.read_csv(filepath_or_buffer='Fake.csv/Fake.csv')

In [23]:
true_news.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [24]:
fake_news.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [25]:
## Tag every row with its class: 0 = fake article, 1 = true article
fake_news['label'] = 0
true_news['label'] = 1

In [26]:
## Stack the two labelled frames row-wise under a fresh 0..n-1 index
df = pd.concat([true_news, fake_news], axis=0, ignore_index=True)

In [27]:
## Shuffle the dataset
# A fixed random_state makes the row order repeatable, so the preview below
# and everything derived from `df` is the same after Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,title,text,subject,date,label
0,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"January 19, 2017",1
1,Clinton blasts Trump for 'casual inciting of v...,WASHINGTON (Reuters) - Democratic presidential...,politicsNews,"August 10, 2016",1
2,Top Iraqi Shi'ite cleric says paramilitaries s...,BAGHDAD (Reuters) - Iraqi Shi ite paramilitary...,worldnews,"December 15, 2017",1
3,George Takei’s Powerful Message To Liberals: ...,"Actor, activist and one of liberal America s s...",News,"April 28, 2016",0
4,N. KOREA’S LATEST MISSILE LAUNCH Aimed At Test...,North Korea said on Monday it had successfully...,politics,"May 15, 2017",0


## Text Processing

In [28]:
import re
import nltk
nltk.download('stopwords')
# WordNetLemmatizer reads the WordNet corpus when lemmatize() is called;
# without this download that call raises LookupError on a machine that does
# not already have the corpus installed.
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sanga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Define these once, outside the function, so they are not rebuilt for every
# document that preprocess_text handles.
# A set is used so the per-word stop-word membership test is a hash lookup.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one raw document into a space-joined string of lemmas.

    Steps, in order: replace every non-word character with a space, replace
    every digit with a space, lowercase, collapse whitespace, drop English
    stop words, and lemmatize the remaining words.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the float NaN
        that pandas uses for a missing cell, or None) is treated as an
        empty document.

    Returns
    -------
    str
        The cleaned words joined by single spaces; '' when nothing is left.

    Notes
    -----
    Relies on the module-level ``stop_words`` set and ``lemmatizer``.
    Underscores are kept because the regex class ``\\W`` does not match ``_``.
    """
    # Missing cells arrive as float NaN / None; re.sub would raise TypeError.
    if not isinstance(text, str):
        return ''
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', ' ', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()  # Merge multiple spaces
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Clean every article body element-wise; this overwrites the raw 'text' column
df['text'] = df['text'].map(preprocess_text)


KeyboardInterrupt: 

In [None]:
df.head()

Unnamed: 0,title,text,subject,date,label
0,A DEFENSIVE JOHN KERRY Asks That Israel Move I...,john kerry gave long winded speech today claim...,Government News,"Dec 28, 2016",0
1,Potential New Hampshire spoiler Kasich could p...,manchester n h reuters u republican presidenti...,politicsNews,"February 8, 2016",1
2,AIRPORT PASSENGER “PAT DOWNS” Get More Intrusi...,one woman awful experience airport could soon ...,Government News,"Aug 23, 2017",0
3,"WATCH: GOP Governor Gets Mic Shut Off, Kicked...",republican governor rick scott florida big smi...,News,"March 10, 2016",0
4,China gives greenlight to dozens of Trump trad...,shanghai washington reuters china granted prel...,politicsNews,"March 8, 2017",1


## Train-Test Split

In [30]:
from sklearn.model_selection import train_test_split
# Features are the article bodies; targets are the 0/1 labels.
X, y = df['text'], df['label']
# Hold out 20% of the rows for evaluation; the fixed seed pins the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
y.shape

(44898,)

### Using Word2Vec

In [None]:
import gensim
from gensim.models import Word2Vec, keyedvectors

### Generate Word2Vec Embeddings


In [34]:
from gensim.utils import simple_preprocess

# Split each document of both splits into a list of tokens
X_train_tokens = list(map(simple_preprocess, X_train))
X_test_tokens = list(map(simple_preprocess, X_test))

# Fit Word2Vec on the training tokens only, so the test split never
# influences the learned vectors. No seed is passed to the model here.
word2vec_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

### Average Word2Vec

In [39]:
# Function to compute average Word2Vec embeddings for a document

def get_avg_word2vec(tokens_list, model, vector_size):
    """Return the mean embedding of the tokens that the model knows.

    Parameters
    ----------
    tokens_list : iterable of str
        Tokens of a single document.
    model : object
        Anything exposing ``model.wv`` that supports ``word in model.wv``
        and ``model.wv[word]`` (e.g. a trained gensim Word2Vec model).
    vector_size : int
        Length of the zero vector returned when no token is in ``model.wv``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the found word vectors, or
        ``np.zeros(vector_size)`` when none of the tokens is found.
    """
    embeddings = [model.wv[word] for word in tokens_list if word in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    # Honour the caller's dimensionality instead of a hard-coded 100 so the
    # fallback always has the same length as the real embeddings.
    return np.zeros(vector_size)

In [40]:
## Embed every tokenized document of both splits as its mean word vector
X_train_word2vec = np.array([
    get_avg_word2vec(doc_tokens, word2vec_model, 100)
    for doc_tokens in X_train_tokens
])
X_test_word2vec = np.array([
    get_avg_word2vec(doc_tokens, word2vec_model, 100)
    for doc_tokens in X_test_tokens
])

### Vectorize Text Using TF-IDF

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Cap the TF-IDF vocabulary at 5000 terms
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the training data and transform both train and test data
# (the test split is only transformed, never used for fitting).
# NOTE(review): .toarray() turns the vectorizer output into dense arrays. At the
# printed (35918, 5000) shape that is roughly 1.4 GB if the dtype is float64 —
# confirm whether any downstream cell actually needs a dense array.
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = vectorizer.transform(X_test).toarray()
# Check the shape of the vectorized data
print(X_train_tfidf.shape, X_test_tfidf.shape)

(35918, 5000) (8980, 5000)


In [51]:
from sklearn.preprocessing import MinMaxScaler
# Learn the per-feature min/max from the training embeddings only, then
# rescale both splits with those same statistics.
scaler = MinMaxScaler().fit(X_train_word2vec)
X_train_scaled = scaler.transform(X_train_word2vec)
X_test_scaled = scaler.transform(X_test_word2vec)

## Train using Logistic regression

In [53]:
from sklearn.linear_model import LogisticRegression

# Create the classifier with default hyper-parameters; nothing is trained
# here — fitting happens where model.fit(...) is called.
model = LogisticRegression()

### Logistic regression on Tf-Idf

In [54]:
# Fit the LogisticRegression instance on the TF-IDF training features
model.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model's performance.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy happens
# to be symmetric, but the argument order is kept consistent with
# classification_report below and with the other evaluation cells.
print(accuracy_score(y_test, y_pred))
print()
print(classification_report(y_test, y_pred))

0.9871937639198218

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4651
           1       0.99      0.99      0.99      4329

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



#### Logistic regression on Word2Vec

In [55]:
# Fresh classifier fitted on the min-max-scaled Word2Vec features
model = LogisticRegression().fit(X_train_scaled, y_train)

# Score the held-out split
y_pred = model.predict(X_test_scaled)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=['Fake', 'True']))

0.961804008908686
              precision    recall  f1-score   support

        Fake       0.96      0.96      0.96      4651
        True       0.96      0.96      0.96      4329

    accuracy                           0.96      8980
   macro avg       0.96      0.96      0.96      8980
weighted avg       0.96      0.96      0.96      8980



### Train Naive-Bayes Model for Word2Vec

In [56]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

# Multinomial Naive Bayes fitted on the min-max-scaled Word2Vec features
nb_model = MultinomialNB().fit(X_train_scaled, y_train)

# Score the held-out split
y_pred = nb_model.predict(X_test_scaled)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=['Fake', 'True']))


0.8782850779510022
              precision    recall  f1-score   support

        Fake       0.83      0.96      0.89      4651
        True       0.95      0.79      0.86      4329

    accuracy                           0.88      8980
   macro avg       0.89      0.88      0.88      8980
weighted avg       0.89      0.88      0.88      8980



### Train Naive-Bayes Model for Tf-Idf

In [58]:
# Multinomial Naive Bayes fitted on the TF-IDF features
nb_model2 = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the held-out split
y_pred2 = nb_model2.predict(X_test_tfidf)
print(accuracy_score(y_true=y_test, y_pred=y_pred2))
print(classification_report(y_true=y_test, y_pred=y_pred2))

0.9345211581291759
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      4651
           1       0.94      0.93      0.93      4329

    accuracy                           0.93      8980
   macro avg       0.93      0.93      0.93      8980
weighted avg       0.93      0.93      0.93      8980

