In [70]:
# Importing the dependencies
import numpy as np
import pandas as pd
import re	#regular expressions library
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [71]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [72]:
print(stopwords.words("english"))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [73]:
#Data Pre-processing
data = pd.read_csv("Datasets\\Fake_News_dataset.csv")

In [74]:
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


0 = Real News

1 = Fake News

In [75]:
data.shape

(20800, 5)

In [76]:
#checking for imbalanced class
data["label"].value_counts()

label
1    10413
0    10387
Name: count, dtype: int64

In [77]:
#checking for missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [78]:
data.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [79]:
#replacing the missing values with null strings
data = data.fillna("")

In [80]:
data.isna().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [81]:
# merge the author name and news title
data["content"] = data["author"] + data["title"]

In [82]:
data.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell LucusHouse Dem Aide: We Didn’t Even Se...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. FlynnFLYNN: Hillary Clinton, Big Wom..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.comWhy the Truth Might Get You ...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss15 Civilians Killed In Single U...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard PortnoyIranian woman jailed for fiction...


In [83]:
# seperating the feature and target
X = data.drop(columns="label", axis=1)	#Fropping a column
Y = data["label"]

In [84]:
print(X)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

In [85]:
print(Y)

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64


### Stemming:

- Stemming is the process of reducing a word to its rootword


### **📌 What is Lemmatization?**  
Lemmatization is the process of **reducing a word to its dictionary (base) form**, known as the **lemma**, while considering the context and meaning of the word. Unlike stemming, it produces **real words**.

✅ **Example:**  
| Word | Lemmatized Form |
|-------|----------------|
| Running | Run |
| Studies | Study |
| Better | Good |

🔹 **Uses WordNet Lemmatizer from NLTK**  
🔹 **Requires POS tagging** (e.g., verb vs. noun)  

📌 **Example in Python:**  
```python
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running", pos="v"))  # 'run'
print(lemmatizer.lemmatize("better", pos="a"))  # 'good'
```

---

### **📌 What is PorterStemmer?**  
PorterStemmer is a **rule-based stemming algorithm** that **removes suffixes** to reduce words to their root form, often resulting in **non-dictionary words**.

✅ **Example:**  
| Word | Stemmed Form |
|-------|--------------|
| Running | Run |
| Studies | Studi |
| Better | Better |

🔹 **Faster than lemmatization**  
🔹 **May not always return real words**  

📌 **Example in Python:**  
```python
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem("running"))  # 'run'
print(stemmer.stem("studies"))  # 'studi'
```

---

### **📌 Difference Between Lemmatization & Stemming**
| Feature         | Lemmatization | Stemming |
|---------------|---------------|----------|
| **Output** | Real words (dictionary form) | Root form (may not be real words) |
| **Accuracy** | High (uses linguistic rules) | Lower (rule-based, no context) |
| **Speed** | Slower (context-based) | Faster (just removes suffixes) |

### **📌 Which One to Use?**  
- ✅ **Use Lemmatization** for **accuracy & NLP tasks** where words need proper meaning.  
- ✅ **Use Stemming** for **fast & simple applications** like search engines.

Would you like me to rewrite your `stemming` function to use **lemmatization** instead? 🚀

In [86]:
port_stem = PorterStemmer()

In [None]:
def stemming(content):
	stemmed_content = re.sub("[^a-zA-Z]", ' ', content)
	"""
	🔹	Removes all non-alphabetic characters (digits, punctuation, 
		special characters) using regular expressions (re.sub).
	🔹 Replaces them with a space (' ') to avoid merging words together.
	"""
	stemmed_content = stemmed_content.lower()
	stemmed_content = stemmed_content.split()
	stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words("english")]
	stemmed_content = " ".join(stemmed_content)
	return stemmed_content

In [88]:
data["content"] = data["content"].apply(stemming)

In [89]:
print(data["content"])

0        darrel lucushous dem aid even see comey letter...
1        daniel j flynnflynn hillari clinton big woman ...
2                consortiumnew comwhi truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoyiranian woman jail fiction unpub...
                               ...                        
20795    jerom hudsonrapp trump poster child white supr...
20796    benjamin hoffmann f l playoff schedul matchup ...
20797    michael j de la merc rachel abramsmaci said re...
20798    alex ansarynato russia hold parallel exercis b...
20799                        david swansonwhat keep f aliv
Name: content, Length: 20800, dtype: object


In [90]:
X = data["content"].values
Y = data["label"].values

In [91]:
print(X)

['darrel lucushous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynnflynn hillari clinton big woman campu breitbart'
 'consortiumnew comwhi truth might get fire' ...
 'michael j de la merc rachel abramsmaci said receiv takeov approach hudson bay new york time'
 'alex ansarynato russia hold parallel exercis balkan'
 'david swansonwhat keep f aliv']


In [92]:
print(data["content"])

0        darrel lucushous dem aid even see comey letter...
1        daniel j flynnflynn hillari clinton big woman ...
2                consortiumnew comwhi truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoyiranian woman jail fiction unpub...
                               ...                        
20795    jerom hudsonrapp trump poster child white supr...
20796    benjamin hoffmann f l playoff schedul matchup ...
20797    michael j de la merc rachel abramsmaci said re...
20798    alex ansarynato russia hold parallel exercis b...
20799                        david swansonwhat keep f aliv
Name: content, Length: 20800, dtype: object


In [93]:
print(Y)

[1 0 1 ... 0 1 1]


In [94]:
print(data["label"])

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64


In [95]:
X.shape

(20800,)

In [96]:
Y.shape

(20800,)

### **📌 What is `.fit()` in Machine Learning?**  
The `.fit()` method **trains a model or a transformer on data** by learning patterns, parameters, or statistical properties.

---

### **📌 Use Cases of `.fit()`**
1. **For ML Models (Supervised Learning)**
   - Trains a model on the dataset (features & labels).  
   ```python
   model.fit(X_train, y_train)
   ```
   - Here, `model` (like `LogisticRegression()`) **learns patterns** from `X_train` to predict `y_train`.

2. **For Data Transformers (Standardization, Normalization, etc.)**
   - Learns **scaling parameters** (mean & standard deviation) from training data.
   ```python
   scaler = StandardScaler()
   scaler.fit(X_train)  # Learns mean & std deviation
   ```
   - After `.fit()`, we apply transformation using `.transform()`.  
   ```python
   X_scaled = scaler.transform(X_train)
   ```

3. **For NLP (Vectorization)**
   - Learns vocabulary from text data.
   ```python
   vectorizer = CountVectorizer()
   vectorizer.fit(text_data)
   ```

---

### **📌 `.fit()` vs `.fit_transform()` vs `.transform()`**
| Method | What It Does? |
|--------|--------------|
| `.fit(X)` | Learns parameters (e.g., mean & std in `StandardScaler()`) |
| `.transform(X)` | Applies the learned transformation to `X` |
| `.fit_transform(X)` | Combines both `.fit()` & `.transform()` in one step |

✅ **Use `.fit()` when you need to learn parameters first**  
✅ **Use `.fit_transform()` when you want to fit & transform in one step**

Let me know if you need an example! 🚀

In [97]:
#converting the textual data to feature vectors
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [98]:
print(X)

  (0, 578)	0.2694167078545384
  (0, 4211)	0.36253203231506576
  (0, 5006)	0.24725958235728157
  (0, 5969)	0.35488202138141456
  (0, 6273)	0.2839932825877812
  (0, 8022)	0.2313366174248873
  (0, 12782)	0.24619727512767192
  (0, 14555)	0.2917725968420029
  (0, 15019)	0.4300622675963931
  (0, 22724)	0.25523360180691607
  (0, 26340)	0.2808837940159642
  (1, 2622)	0.3562953366945267
  (1, 3281)	0.18652439327549428
  (1, 3859)	0.45980466668763476
  (1, 4767)	0.23338756776626793
  (1, 5916)	0.31810058109638056
  (1, 8772)	0.5258635625386451
  (1, 11313)	0.24166773097712638
  (1, 27923)	0.36911845953845024
  (2, 5121)	0.5511414848555652
  (2, 5240)	0.40440534260277944
  (2, 8567)	0.3411947414020896
  (2, 9454)	0.30743020569262086
  (2, 16361)	0.43295215406038445
  (2, 26235)	0.3665032495181434
  :	:
  (20797, 1249)	0.3072223353708335
  (20797, 2257)	0.3357782642976524
  (20797, 6088)	0.21253094503918346
  (20797, 11692)	0.2992170910232368
  (20797, 14104)	0.22761807337911874
  (20797, 16217)	0