<a href="https://colab.research.google.com/github/RaunakRaj2081/Machine-Learning_part1/blob/main/2_10_Text_Data_Pre_Processing_Use_Case.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

#Data Pre-Processing

In [3]:
# Use Python's built-in file handling to inspect the problematic line
file_path = '/content/WELFake_Dataset.csv'
target_line_number = 8288 # Error is at row 8287, which is likely line 8288 including header

try:
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Print lines around the target line number (e.g., 5 lines before and 5 lines after)
    start_index = max(0, target_line_number - 6) # -1 for 0-indexing, -5 to show previous 5 lines
    end_index = min(len(lines), target_line_number + 5) # +5 to show next 5 lines

    print(f"Inspecting lines from {start_index + 1} to {end_index} (0-indexed: {start_index} to {end_index-1}):")
    print("-" * 30) # Separator

    for i in range(start_index, end_index):
        # Print the line number and the raw line content (strip just trailing newline for clarity)
        print(f"Line {i + 1}: {lines[i].rstrip()}")

    print("-" * 30) # Separator

except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
except Exception as e:
    print(f"An error occurred while reading the file: {e}")

Inspecting lines from 8283 to 5006 (0-indexed: 8282 to 5005):
------------------------------
------------------------------


In [4]:
# load the data to a pandas dataframe
# Added engine='python' and on_bad_lines='skip' to handle potential parsing errors
# on_bad_lines='skip' will skip rows that cause parsing issues. Use with caution.
# If you are using pandas version < 1.4.0, use error_bad_lines=False instead of on_bad_lines='skip'
news_data = pd.read_csv('/content/WELFake_Dataset.csv', engine='python', on_bad_lines='skip')

# You can also use 'warn' to see which lines are skipped:
# news_data = pd.read_csv('/content/WELFake_Dataset.csv', engine='python', on_bad_lines='warn')

In [5]:
# first 5 rows of the dataset
news_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


0 --> Real News

1 --> Fake News

In [6]:
news_data.shape

(869, 4)

In [7]:
# checking for missing values
news_data.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,14
text,13
label,15


In [8]:
# replacing the missing values with null string
news_data = news_data.fillna('')

In [9]:
# merging the text and the title
news_data['content'] = news_data['text']+' '+news_data['title']

In [10]:
# first 5 rows of the dataset
news_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,content
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,No comment is expected from Barack Obama Membe...
1,1,,Did they post their votes for Hillary already?,1,Did they post their votes for Hillary already?
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,"Now, most of the demonstrators gathered last ..."
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,A dozen politically active pastors came here f...
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,"The RS-28 Sarmat missile, dubbed Satan 2, will..."


In [11]:
# separating feature and target
X = news_data.drop(columns='label', axis =1)
Y = news_data['label']

In [12]:
print(X)

    Unnamed: 0                                              title  \
0            0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1            1                                                      
2            2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3            3  Bobby Jindal, raised Hindu, uses story of Chri...   
4            4  SATAN 2: Russia unvelis an image of its terrif...   
..         ...                                                ...   
864        849  LIBERAL RAGS LIKE USA TODAY Working Overtime T...   
865        850  BOOM! BRONCOS PLAYER Has Message For Fellow NF...   
866        851  Ukraine says hopes for continued support again...   
867        852  DONALD RUMSELD HUMILIATES “The View” Dingbat J...   
868        853  TICKING TIME BOMB: Why More Young Muslims In T...   

                                                  text  \
0    No comment is expected from Barack Obama Membe...   
1       Did they post their votes for Hillary already? 

In [13]:
print(Y)

0      1
1      1
2      1
3      0
4      1
      ..
864    1
865    1
866    0
867    1
868    1
Name: label, Length: 869, dtype: object


Stemming:

Stemming is the process of reducing a word to its Root Word

In [14]:
port_stem = PorterStemmer()

In [15]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [16]:
news_data['content'] = news_data['content'].apply(stemming)

In [17]:
print(news_data['content'])

0      comment expect barack obama member fyf fukyofl...
1                              post vote hillari alreadi
2      demonstr gather last night exercis constitut p...
3      dozen polit activ pastor came privat dinner fr...
4      rs sarmat missil dub satan replac ss fli mile ...
                             ...                        
864    figur yet liber main stream media work overtim...
865    clarifi protest unit state right also challeng...
866    kiev reuter ukrainian presid petro poroshenko ...
867    question whoopi goldberg talent actress abil m...
868    statist shock tell pleas sure share piec hilla...
Name: content, Length: 869, dtype: object


In [18]:
X = news_data['content'].values
Y = news_data['label'].values

In [19]:
print(X)

['comment expect barack obama member fyf fukyoflag blacklivesmatt movement call lynch hang white peopl cop encourag other radio show tuesday night turn tide kill white peopl cop send messag kill black peopl america one f yoflag organ call sunshin radio blog show host texa call sunshin f ing opinion radio show snapshot fyf lolatwhitefear twitter page p show urg support call fyf tonight continu dismantl illus white snapshot twitter radio call invit fyf radio show air p eastern standard time show caller clearli call lynch kill white peopl minut clip radio show heard provid breitbart texa someon would like refer hannib alreadi receiv death threat result interrupt fyf confer call unidentifi black man said mother f ker start f ing like us bunch ni er takin one us roll said caus alreadi roll gang anyway six seven black mother f cker see white person lynch ass let turn tabl conspir cop start lose peopl state emerg specul one two thing would happen big ass r war ni er go start backin alreadi ge

In [20]:
print(Y)

['1' '1' '1' '0' '1' '1' '1' '1' '1' '1' '1' '0' '0' '1' '0' '0' '1' '0'
 '1' '0' '1' '1' '1' '1' '1' '1' '0' '0' '0' '1' '0' '0' '1' '0' '0' '0'
 '1' '1' '1' '1' '1' '0' '1' '1' '0' '0' '0' '0' '1' '0' '1' '0' '0' '0'
 '0' '0' '1' '1' '0' '0' '0' '1' '1' '1' '1' '1' '1' '0' '1' '0' '1' '0'
 '0' '0' '1' '1' '0' '1' '0' '1' '0' '1' '0' '0' '1' '0' '1' '0' '1' '0'
 '0' '0' '0' '1' '1' '1' '0' '1' '1' '1' '0' '0' '1' '1' '1' '1' '1' '1'
 '1' '0' '1' '1' '0' '0' '1' '1' '1' '1' '1' '0' '0' '0' '1' '0' '0' '0'
 '0' '1' '1' '0' '1' '1' '0' '0' '0' '1' '1' '0' '1' '1' '1' '0' '1' '0'
 '0' '1' '1' '0' '1' '1' '0' '0' '1' '1' '0' '1' '1' '0' '0' '1' '0' '0'
 '1' '1' '0' '0' '1' '1' '0' '1' '0' '0' '1' '0' '1' '0' '1' '1' '1' '1'
 '1' '1' '0' '0' '1' '1' '0' '0' '1' '0' '0' '0' '0' '1' '0' '1' '0' '1'
 '1' '0' '0' '0' '0' '1' '1' '1' '1' '0' '1' '1' '1' '1' '0' '0' '1' '0'
 '1' '0' '0' '0' '1' '0' '0' '0' '1' '0' '1' '1' '1' '0' '1' '0' '0' '1'
 '0' '1' '0' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1

In [21]:
Y.shape

(869,)

In [22]:
# converting the textual data to feature vectors
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)

In [23]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 167652 stored elements and shape (869, 17684)>
  Coords	Values
  (0, 110)	0.022169210642694944
  (0, 147)	0.02073731455746633
  (0, 249)	0.04782368380490262
  (0, 301)	0.023326036054781594
  (0, 372)	0.03467088657459486
  (0, 410)	0.033358547107006205
  (0, 437)	0.02774605628721979
  (0, 448)	0.05851489969266163
  (0, 497)	0.01679857039812936
  (0, 498)	0.02865250823385031
  (0, 616)	0.01679857039812936
  (0, 668)	0.06263074005601031
  (0, 698)	0.01833262455145468
  (0, 871)	0.13343418842802482
  (0, 941)	0.017522169843316955
  (0, 1098)	0.04782368380490262
  (0, 1196)	0.022322835453155873
  (0, 1338)	0.018419282157584264
  (0, 1458)	0.020495888868747557
  (0, 1510)	0.02131427890775214
  (0, 1580)	0.2194512821998493
  (0, 1588)	0.09564736760980524
  (0, 1598)	0.08326058170789516
  (0, 1644)	0.033358547107006205
  (0, 1875)	0.12252262795101658
  :	:
  (868, 16620)	0.009819296828899239
  (868, 16740)	0.023070462282112025
  (86