In [None]:
#Importing necessary libraries
!pip install contractions
import pandas as pd
import numpy as np
import re
import nltk
import contractions
from nltk.corpus import stopwords
nltk.download('punkt')


Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.1.0 textsearch-0.0.24


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
'''Reading dataset and filling missing values'''

df = pd.read_csv('FakeNewsNet.csv.zip')

# Check whether the dataset has missing values
print('Before filling the missing values initial dataset \n', df.isna().sum())

# Missing values are present in the news_url and source_domain columns; fill
# them with the placeholder word 'missing'.
# A single DataFrame-level fillna is used instead of
# df[col].fillna(..., inplace=True): the chained in-place form works on an
# intermediate Series, triggers a pandas warning, and does not update df
# when Copy-on-Write is enabled.
df = df.fillna({'news_url': 'missing', 'source_domain': 'missing'})

print('\nAfter filling the missing values initial dataset \n', df.isna().sum())

# Now the dataset contains no missing values

Before filling the missing values initial dataset 
 title              0
news_url         330
source_domain    330
tweet_num          0
real               0
dtype: int64

After filling the missing values initial dataset 
 title            0
news_url         0
source_domain    0
tweet_num        0
real             0
dtype: int64


In [None]:
'''Clean text function -The function performs the following steps
    i)  Convert to lower case
    ii) Contraction Splitting
    iii) URL Removal
    iv) Remove new lines
    v)  Remove words containing numbers
    vi) Remove extra spaces
    vii)Remove special characters
    viii)stop words removal
'''
import nltk

# Fetch the NLTK stop-word corpus that stopwords.words() reads from
nltk.download('stopwords')

# English stop words held in a set, so the per-word membership test used when
# filtering tokens is a hash lookup rather than a list scan
stop_words = {*stopwords.words('english')}

# Function to expand contractions
def remove_contractions(text):
    """Expand contractions in ``text`` one whitespace-separated token at a time.

    Each token is passed through ``contractions.fix`` and the results are
    re-joined with single spaces, so any run of whitespace in the input is
    collapsed to one space in the output.
    """
    expanded_tokens = map(contractions.fix, text.split())
    return ' '.join(expanded_tokens)

def clean_text(text):
    """Normalize raw text into a lower-case, stop-word-free string of letters.

    Steps, in order:
      1. lower-case the text;
      2. expand contractions via ``remove_contractions``;
      3. strip URLs (any token starting with ``http``);
      4. drop every word that contains a digit;
      5. turn hyphens, dashes and slashes into spaces, so a compound such as
         "post-baby" stays two words instead of fusing into "postbaby";
      6. delete every remaining character that is not an ASCII letter or
         whitespace (apostrophes and periods are still removed without
         inserting a space);
      7. remove stop words. Splitting and re-joining here also collapses
         newlines and repeated spaces, so no separate whitespace pass is
         needed.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN for a missing title)
        yields an empty string instead of raising.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; may be empty.
    """
    # Missing values reach .apply() as float NaN, which has no .lower()
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = remove_contractions(text)
    text = re.sub(r'http\S+', '', text)       # URLs
    text = re.sub(r'\w*\d\w*', '', text)      # words containing digits
    # Word-joining punctuation becomes a separator; must run before the
    # catch-all deletion below, otherwise the two halves are glued together
    text = re.sub(r'[-\u2010-\u2015/]', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)   # remaining special characters
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every title element-wise and store the result in a new column,
# then preview the first rows
df['cleaned_title'] = df['title'].map(clean_text)
df.head(n=15)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,title,news_url,source_domain,tweet_num,real,cleaned_title
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1,kandi burruss explodes rape accusation real ho...
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1,peoples choice awards best red carpet looks
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1,sophia bush sends sweet birthday message one t...
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1,colombian singer maluma sparks rumours inappro...
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1,gossip girl years later upper east siders shoc...
5,Gwen Stefani Got Dumped by Blake Shelton Over ...,www.intouchweekly.com/posts/gwen-stefani-dumpe...,www.intouchweekly.com,45,0,gwen stefani got dumped blake shelton jealousy...
6,Broward County Sheriff Fired For Lying About P...,https://yournewswire.com/broward-county-sherif...,yournewswire.com,124,0,broward county sheriff fired lying parkland
7,Amber Rose Shuts Down French Montana Dating Ru...,www.etonline.com/news/214798_amber_rose_shuts_...,www.etonline.com,4,0,amber rose shuts french montana dating rumors ...
8,Mindy Kaling makes first post-baby appearance ...,https://www.aol.com/article/entertainment/2018...,www.aol.com,59,1,mindy kaling makes first postbaby appearance d...
9,Katharine McPhee Butchers Tony Nominations: “I...,https://www.98online.com/2018/05/02/katharine-...,www.98online.com,10,1,katharine mcphee butchers tony nominations dri...


In [None]:
'''Performing stemming'''

import nltk
from nltk.stem import PorterStemmer

# The Porter stemmer is algorithmic and needs no NLTK data package, so nothing
# is downloaded in this cell (stop words were already removed during cleaning).
# One stemmer instance is shared by every call to stemming().
ps = PorterStemmer()

def stemming(text):
    """Return ``text`` with each whitespace-separated token stemmed by ``ps``.

    Tokens are re-joined with single spaces, so the output never contains
    repeated or leading/trailing whitespace.
    """
    return ' '.join(map(ps.stem, text.split()))

# Stem every cleaned title element-wise and keep the result in its own column
df['stemmed_title'] = df['cleaned_title'].map(stemming)

# Preview the first rows to confirm the new column looks right
df.head(n=5)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,title,news_url,source_domain,tweet_num,real,cleaned_title,stemmed_title
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1,kandi burruss explodes rape accusation real ho...,kandi burruss explod rape accus real housew at...
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1,peoples choice awards best red carpet looks,peopl choic award best red carpet look
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1,sophia bush sends sweet birthday message one t...,sophia bush send sweet birthday messag one tre...
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1,colombian singer maluma sparks rumours inappro...,colombian singer maluma spark rumour inappropr...
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1,gossip girl years later upper east siders shoc...,gossip girl year later upper east sider shock ...


In [None]:
'''performing lemmatization'''

from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages used for lemmatization
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

# Single lemmatizer instance shared by every call to lemmatize()
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    """Return ``text`` with each whitespace-separated token lemmatized.

    Every token is passed to ``lemmatizer.lemmatize`` and the results are
    re-joined with single spaces.

    NOTE(review): no part-of-speech tag is passed, so the lemmatizer's default
    applies to every token; in the sample output verb forms such as "sends"
    come back unchanged. Confirm this is acceptable for the downstream model.
    """
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

# Lemmatize every cleaned title element-wise into a new column, then preview it
df['lemmatized_title'] = df['cleaned_title'].map(lemmatize)
df.head(n=5)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Unnamed: 0,title,news_url,source_domain,tweet_num,real,cleaned_title,stemmed_title,lemmatized_title
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1,kandi burruss explodes rape accusation real ho...,kandi burruss explod rape accus real housew at...,kandi burruss explodes rape accusation real ho...
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1,peoples choice awards best red carpet looks,peopl choic award best red carpet look,people choice award best red carpet look
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1,sophia bush sends sweet birthday message one t...,sophia bush send sweet birthday messag one tre...,sophia bush sends sweet birthday message one t...
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1,colombian singer maluma sparks rumours inappro...,colombian singer maluma spark rumour inappropr...,colombian singer maluma spark rumour inappropr...
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1,gossip girl years later upper east siders shoc...,gossip girl year later upper east sider shock ...,gossip girl year later upper east siders shock...


I have performed both stemming and lemmatization. Comparing the two outputs, lemmatization is likely the better choice over stemming. Here’s why:
Lemmatization contributes to data quality and context-awareness by producing valid words without losing their meaning, whereas stemming can truncate words into non-words (for example, "accusation" becomes "accus").

In [None]:
# Drop the URL, domain and tweet-count columns, keeping the title, the label
# and the derived text columns
columns_to_drop = ['news_url', 'source_domain', 'tweet_num']
df = df.drop(columns=columns_to_drop)

# Preview the trimmed dataframe to verify
df.head(n=5)

Unnamed: 0,title,real,cleaned_title,stemmed_title,lemmatized_title
0,Kandi Burruss Explodes Over Rape Accusation on...,1,kandi burruss explodes rape accusation real ho...,kandi burruss explod rape accus real housew at...,kandi burruss explodes rape accusation real ho...
1,People's Choice Awards 2018: The best red carp...,1,peoples choice awards best red carpet looks,peopl choic award best red carpet look,people choice award best red carpet look
2,Sophia Bush Sends Sweet Birthday Message to 'O...,1,sophia bush sends sweet birthday message one t...,sophia bush send sweet birthday messag one tre...,sophia bush sends sweet birthday message one t...
3,Colombian singer Maluma sparks rumours of inap...,1,colombian singer maluma sparks rumours inappro...,colombian singer maluma spark rumour inappropr...,colombian singer maluma spark rumour inappropr...
4,Gossip Girl 10 Years Later: How Upper East Sid...,1,gossip girl years later upper east siders shoc...,gossip girl year later upper east sider shock ...,gossip girl year later upper east siders shock...


In [None]:
# Cleaned dataset after all preprocessing steps: original title, label and the
# cleaned, stemmed and lemmatized versions of the title
df.head(n=10)

Unnamed: 0,title,real,cleaned_title,stemmed_title,lemmatized_title
0,Kandi Burruss Explodes Over Rape Accusation on...,1,kandi burruss explodes rape accusation real ho...,kandi burruss explod rape accus real housew at...,kandi burruss explodes rape accusation real ho...
1,People's Choice Awards 2018: The best red carp...,1,peoples choice awards best red carpet looks,peopl choic award best red carpet look,people choice award best red carpet look
2,Sophia Bush Sends Sweet Birthday Message to 'O...,1,sophia bush sends sweet birthday message one t...,sophia bush send sweet birthday messag one tre...,sophia bush sends sweet birthday message one t...
3,Colombian singer Maluma sparks rumours of inap...,1,colombian singer maluma sparks rumours inappro...,colombian singer maluma spark rumour inappropr...,colombian singer maluma spark rumour inappropr...
4,Gossip Girl 10 Years Later: How Upper East Sid...,1,gossip girl years later upper east siders shoc...,gossip girl year later upper east sider shock ...,gossip girl year later upper east siders shock...
5,Gwen Stefani Got Dumped by Blake Shelton Over ...,0,gwen stefani got dumped blake shelton jealousy...,gwen stefani got dump blake shelton jealousi d...,gwen stefani got dumped blake shelton jealousy...
6,Broward County Sheriff Fired For Lying About P...,0,broward county sheriff fired lying parkland,broward counti sheriff fire lie parkland,broward county sheriff fired lying parkland
7,Amber Rose Shuts Down French Montana Dating Ru...,0,amber rose shuts french montana dating rumors ...,amber rose shut french montana date rumor call...,amber rose shuts french montana dating rumor c...
8,Mindy Kaling makes first post-baby appearance ...,1,mindy kaling makes first postbaby appearance d...,mindi kale make first postbabi appear disneyla...,mindy kaling make first postbaby appearance di...
9,Katharine McPhee Butchers Tony Nominations: “I...,1,katharine mcphee butchers tony nominations dri...,katharin mcphee butcher toni nomin drink,katharine mcphee butcher tony nomination drinking


In [None]:
# Persist the preprocessed dataframe; index=False keeps the row index out of the file
output_path = 'preprocessed_dataset.csv'
df.to_csv(output_path, index=False)


In [None]:
# Reload the saved file to check that it round-trips.
# keep_default_na=False stops pandas from converting empty strings (a title
# made up only of stop words or digits cleans to '') or literal tokens such as
# "null" into NaN. The frame had no missing values when it was written, so any
# NaN appearing here would be a parsing artifact that breaks later string
# processing.
df1 = pd.read_csv('preprocessed_dataset.csv', keep_default_na=False)
df1.head()

Unnamed: 0,title,real,cleaned_title,stemmed_title,lemmatized_title
0,Kandi Burruss Explodes Over Rape Accusation on...,1,kandi burruss explodes rape accusation real ho...,kandi burruss explod rape accus real housew at...,kandi burruss explodes rape accusation real ho...
1,People's Choice Awards 2018: The best red carp...,1,peoples choice awards best red carpet looks,peopl choic award best red carpet look,people choice award best red carpet look
2,Sophia Bush Sends Sweet Birthday Message to 'O...,1,sophia bush sends sweet birthday message one t...,sophia bush send sweet birthday messag one tre...,sophia bush sends sweet birthday message one t...
3,Colombian singer Maluma sparks rumours of inap...,1,colombian singer maluma sparks rumours inappro...,colombian singer maluma spark rumour inappropr...,colombian singer maluma spark rumour inappropr...
4,Gossip Girl 10 Years Later: How Upper East Sid...,1,gossip girl years later upper east siders shoc...,gossip girl year later upper east sider shock ...,gossip girl year later upper east siders shock...
