In [45]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords # the for on in with these should be removed
from nltk.stem.porter import PorterStemmer # played playing == play
from sklearn.feature_extraction.text import TfidfVectorizer # played ==[0.0]
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [26]:
news_df = pd.read_csv('train.csv')

In [27]:
news_df.head()

Unnamed: 0,text;label
0,dark agenda behind globalism open border altma...
1,america poor still get shaft sami jamil jadall...
2,number accuser grow former miss finland accuse...
3,heroic prego advertisement replaces refresh we...
4,russia syria debbie reynolds thursday even bri...


In [28]:
#About the Dataset:

#text: Contains the content of the news article (or a portion of it).
#label: A binary label indicating whether the news article is real or fake.
#1: Fake News
#0: Real News


In [29]:
#preprocessing
news_df.isnull().sum() #to check null values

text;label    0
dtype: int64

In [30]:
news_df.shape  #to get the rows and columns

(16646, 1)

In [35]:
news_df = news_df.fillna(' ') #to fill a space in the null value places

In [32]:
news_df.isnull().sum()

text;label    0
dtype: int64

In [33]:
news_df['content'] = news_df['text']
#The error KeyError: 'text' means that the column name 'text' does not exist in your DataFrame.This likely happened because your dataset was not properly split into columns due to an incorrect delimiter.

KeyError: 'text'

In [3]:
print(news_df.columns)
#check the actual column names in dataFrame.

Index(['text;label'], dtype='object')


In [4]:
import pandas as pd

news_df = pd.read_csv("train.csv", delimiter=";")  # Use correct delimiter
print(news_df.head())  # Check if columns are separated properly


                                                text  label
0  dark agenda behind globalism open border altma...      0
1  america poor still get shaft sami jamil jadall...      0
2  number accuser grow former miss finland accuse...      0
3  heroic prego advertisement replaces refresh we...      0
4  russia syria debbie reynolds thursday even bri...      1


In [5]:
news_df.shape   #to get the rows and columns

(16646, 2)

In [6]:
news_df = news_df.fillna(' ')  #to fill a space in the null value places

In [7]:
news_df.isnull().sum() #to check null values

text     0
label    0
dtype: int64

In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
print("NLTK packages downloaded successfully!")

NLTK packages downloaded successfully!


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\noori\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\noori\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
!pip install swifter



In [6]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import swifter  # Install using: pip install swifter

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load dataset with proper delimiter handling
news_df = pd.read_csv("train.csv", delimiter=";", engine="python")

# Check and fix column names if needed
if "text;label" in news_df.columns:
    news_df[['text', 'label']] = news_df['text;label'].str.split(';', expand=True)
    news_df.drop(columns=['text;label'], inplace=True)

# Verify if 'text' column exists now
if 'text' not in news_df.columns:
    raise KeyError("The 'text' column is missing after processing. Check the dataset format.")

# Load stopwords once (for efficiency)
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):  # Handle non-string values
        return text
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.lower() not in stop_words]  # Use preloaded stopwords
    return ' '.join(tokens)

# Apply preprocessing using swifter (parallel processing)
news_df['text'] = news_df['text'][:1000].swifter.apply(preprocess_text)  # Process first 1000 rows for testing

# Save cleaned data
news_df.to_csv("cleaned_train.csv", index=False)

print("Preprocessing completed and saved to 'cleaned_train.csv'")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\noori\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\noori\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Preprocessing completed and saved to 'cleaned_train.csv'


In [7]:
#separating the data & label
X = news_df.drop('label',axis=1)
y = news_df['label']

In [8]:
print(X)

                                                    text
0      dark agenda behind globalism open border altma...
1      america poor still get shaft sami jamil jadall...
2      number accuser grow former miss finland accuse...
3      heroic prego advertisement replaces refresh we...
4      russia syria debbie reynolds thursday even bri...
...                                                  ...
16641                                                NaN
16642                                                NaN
16643                                                NaN
16644                                                NaN
16645                                                NaN

[16646 rows x 1 columns]


In [9]:
"""
Stemming:
Stemming is the process of reducing a word to its Root word

example: hung hanged hanging ======hang

Steps:
lower case
splitting
removing stopwords
stemming
"""



In [10]:
# Initialize stemmer and stopwords once
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Define stemming function with error handling
def stemming(text):
    if not isinstance(text, str):  # Ensure input is a string
        return ""  # Return empty string for non-string values
    text = re.sub('[^a-zA-Z]', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    words = text.split()  # Tokenize
    words = [ps.stem(word) for word in words if word not in stop_words]  # Stemming + remove stopwords
    return ' '.join(words)  # Rejoin words

# Load dataset
news_df = pd.read_csv("train.csv", delimiter=";", engine="python")

# Check and fix column names if needed
if "text;label" in news_df.columns:
    news_df[['text', 'label']] = news_df['text;label'].str.split(';', expand=True)
    news_df.drop(columns=['text;label'], inplace=True)

# Apply stemming to the 'text' column
news_df['text'] = news_df['text'].astype(str).apply(stemming)

# Save cleaned data
news_df.to_csv("cleaned_train.csv", index=False)
print("Preprocessing completed and saved to 'cleaned_train.csv'")

Preprocessing completed and saved to 'cleaned_train.csv'


In [32]:
news_df['text'] = news_df['text'].apply(stemming)

In [34]:
news_df['text']

0        dark agenda behind global open border altmarke...
1        america poor still get shaft sami jamil jadall...
2        number accu grow former miss finland accu trum...
3        heroic prego adverti replac refresh webpag pre...
4        russia syria debbi reynold thursday even brief...
                               ...                        
16641    comment cost selfdriv taxi realli sixtyseven c...
16642    interest dutert get billion china get money ja...
16643    forget encyclopaedia get pokiespedia home win ...
16644    u elect race huma abedin connect minut video n...
16645    shame obama legaci white man beat viciou vote ...
Name: text, Length: 16646, dtype: object

In [35]:
#separating the data and label
# Extract features and labels
X = news_df['text'].values  # Use 'text' instead of 'content'
y = news_df['label'].values  


In [36]:
#converting the textual data to numerical data

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the text data
X = vectorizer.fit_transform(X)


In [37]:
print(X)

  (0, 34015)	0.03284309433004357
  (0, 16687)	0.00763985160811272
  (0, 22809)	0.02379168166533704
  (0, 15811)	0.015835621663781812
  (0, 47979)	0.010269070425615813
  (0, 93127)	0.011356457762665181
  (0, 94344)	0.016309199992176608
  (0, 38458)	0.02755487311399001
  (0, 17517)	0.01754530891487748
  (0, 65719)	0.011165687668851725
  (0, 47431)	0.014574144978956356
  (0, 81568)	0.01202024373179219
  (0, 46282)	0.008255268779848687
  (0, 72875)	0.043559541240720465
  (0, 88922)	0.02834437949254646
  (0, 18172)	0.012451129209672299
  (0, 21438)	0.02922817287661162
  (0, 86724)	0.023957795592866107
  (0, 72825)	0.02484021985754806
  (0, 20573)	0.014274461994284891
  (0, 93063)	0.016158813253991402
  (0, 26453)	0.010703270880873705
  (0, 1192)	0.014675049654489156
  (0, 6363)	0.008884110893008645
  (0, 32508)	0.007088798489671592
  :	:
  (16645, 95818)	0.055347865957992154
  (16645, 22710)	0.04171271971445988
  (16645, 87026)	0.17802894211521525
  (16645, 77378)	0.05983198029917312
  (166

In [39]:
#Splitting the dataset to training & test data

In [42]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)


In [43]:
X_train.shape

(13316, 96715)

In [None]:
#Training the Model: Logistic Regression

In [46]:
model = LogisticRegression()
model.fit(X_train,Y_train)

In [47]:
# on training set
train_y_pred = model.predict(X_train)
print(accuracy_score(train_y_pred,Y_train))

0.9731901471913488


In [48]:
# on testing set
testing_y_pred = model.predict(X_test)
print(accuracy_score(testing_y_pred,Y_test))

0.9453453453453453


In [49]:
#Detection System

In [50]:
input_data = X_test[10]
prediction = model.predict(input_data)

In [51]:
if prediction[0] == 0:
    print('The News Is Real')
else:
    print('The News is Fake')

The News Is Real


In [54]:
news_df['text'][1]

'america poor still get shaft sami jamil jadallah novemb day ugli distast presidenti elect u histori see candid presid u presid elect grope email make u laugh stock world saw campaign littl anyth prioriti realli matter american peopl candid talk bring back job industri manufactur job know well liar never bring back kind industri manufactur job give middleclass american sen digniti secur two candid best cater rich power peopl money billionair alway shape nation agenda polit billion pour year nation elect differ previou nationalpresidenti elect close billion inject invest elect return donor never voter america poor black hispan white well neglect long time get shaft everywh everyon especi congress empow corpor america screw poor everi turn everi way congress legisl give incent corpor locat manufactur job oversea allow bank financ institut especi credit card compani give shaft poor futur hold next gener surpri great countri stagger number poor million popul prospect american part categori

In [None]:
#ANOTHER METHOD 