### Model Training
After the EDA and data cleaning, it is time to train the model.
We will use the dataset that was saved after cleaning.

In [64]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [48]:
# Load the cleaned dataset saved after EDA / data cleaning.
# A forward slash is used as the separator: it works on every OS (including
# Windows) and avoids the invalid "\W" escape sequence a backslash would create.
df = pd.read_csv('data/WELFake_Dataset_Cleaned.csv')

In [49]:
# Preview the first rows of the freshly loaded dataset.
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,text_len,words_count,lang
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,5049,871,en
1,1,,Did they post their votes for Hillary already?,1,46,8,en
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,216,34,en
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,8010,1321,en
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,1916,329,en


#### Calculate the sentiment_score and clean the text

In [None]:
# Sentiment polarity of each article body, computed with TextBlob.
def _polarity(raw_text):
    """Return the TextBlob sentiment polarity of ``raw_text`` coerced to ``str``."""
    return TextBlob(str(raw_text)).sentiment.polarity


df['sentiment_score'] = df['text'].apply(_polarity)

In [54]:
# Preview the frame again to check the new sentiment_score column.
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,text_len,words_count,lang,sentiment_score
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,5049,871,en,0.033103
1,1,,Did they post their votes for Hillary already?,1,46,8,en,0.0
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,216,34,en,0.258929
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,8010,1321,en,0.113865
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,1916,329,en,0.056373


In [60]:
# text cleaning
import re
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [57]:
# Fetch the NLTK resources used by the text-cleaning step.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Shared objects read by clean_text: the lemmatizer and the English stop-word set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\lmokh
[nltk_data]     phone\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\lmokh
[nltk_data]     phone\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to C:\Users\lmokh
[nltk_data]     phone\AppData\Roaming\nltk_data...


In [61]:
# Register progress_apply on pandas objects.
tqdm.pandas()


def clean_text(text):
    """Normalise one raw document into a space-joined string of lemmas.

    The input is coerced with ``str`` and lowercased; URL-like tokens and
    every character other than ``a-z`` or whitespace are deleted; the result
    is split on whitespace; words present in ``stop_words`` are dropped; and
    each remaining word is passed through ``lemmatizer.lemmatize``.
    """
    lowered = str(text).lower()
    # URLs go first, then everything that is not a lowercase letter or whitespace
    # (punctuation, digits, non-ASCII characters).
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered)
    letters_only = re.sub(r"[^a-z\s]", "", without_urls)
    # Filter stop words and lemmatize in a single pass over the tokens.
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )
    return " ".join(lemmas)


# Clean every article body, with a tqdm progress bar.
df['clean_text'] = df['text'].progress_apply(clean_text)

  0%|          | 0/62119 [00:00<?, ?it/s]

100%|██████████| 62119/62119 [04:12<00:00, 245.98it/s]


In [None]:
# Persist the frame (including sentiment_score and clean_text) without the index.
df.to_csv(
    'data/preprocessed_fake_news.csv',
    index=False,
)

#### Converting text to numeric features and preparing the train and test data

In [65]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

In [66]:
# List the columns available before selecting model features.
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label', 'text_len', 'words_count',
       'lang', 'sentiment_score', 'clean_text'],
      dtype='object')

In [67]:
# Split the data into train and test sets.
# Inputs: the cleaned text plus two numeric columns; target: the label column.
# A document that cleans down to an empty string is read back from CSV as NaN,
# which TfidfVectorizer cannot process, so missing values are normalised to ''.
# This is a no-op when the frame has no missing clean_text values.
X = df['clean_text'].fillna('')
y = df['label']

numeric_features = df[['sentiment_score', 'text_len']]

# Splitting all three objects in one call keeps their rows aligned;
# stratify=y keeps the label proportions the same in both partitions.
x_train_text, x_test_text, x_train_num, x_test_num, y_train, y_test = train_test_split(
    X,
    numeric_features,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [68]:
# --- Text: TF-IDF over unigrams and bigrams, capped at 10,000 features ---
# The vocabulary is fitted on the training split only and then reused for test.
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
)
x_train_text_tfidf = tfidf.fit_transform(x_train_text)
x_test_text_tfidf = tfidf.transform(x_test_text)

# --- Numeric: standardise using statistics fitted on the training split ---
scaler = StandardScaler()
x_train_num_scaled = scaler.fit_transform(x_train_num)
x_test_num_scaled = scaler.transform(x_test_num)

# --- Combine: numeric columns are appended after the TF-IDF columns ---
x_train_final = hstack((x_train_text_tfidf, x_train_num_scaled))
x_test_final = hstack((x_test_text_tfidf, x_test_num_scaled))

#### model training

In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [71]:
# Logistic regression on the combined TF-IDF + numeric feature matrix.
# max_iter is set to 1000 iterations for the solver.
model = LogisticRegression(max_iter=1000)
model.fit(X=x_train_final, y=y_train)

In [72]:
# Evaluate on the held-out test split.
y_pred = model.predict(x_test_final)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(test_report)

Accuracy: 0.9472794591113973
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      6924
           1       0.94      0.94      0.94      5500

    accuracy                           0.95     12424
   macro avg       0.95      0.95      0.95     12424
weighted avg       0.95      0.95      0.95     12424



In [73]:
# Evaluate on the training split, to compare against the test metrics.
# The predictions get their own name so that y_pred keeps holding the test
# predictions and stays aligned with y_test for any later cell.
y_train_pred = model.predict(x_train_final)
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

Accuracy: 0.9609014991447832
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     27693
           1       0.96      0.96      0.96     22002

    accuracy                           0.96     49695
   macro avg       0.96      0.96      0.96     49695
weighted avg       0.96      0.96      0.96     49695

