In [None]:
import pandas as pd
import numpy as np
import re
import string

# Text Processing
import nltk
from nltk.tokenize import word_tokenize, WhitespaceTokenizer, RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

# ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [None]:
fake_df = pd.read_csv('/content/Fake.csv')
true_df = pd.read_csv('/content/True.csv')


 Add a label to both

In [None]:
fake_df['label'] = 0  # Fake news
true_df['label'] = 1  # Real news

# Combine both datasets
df = pd.concat([fake_df, true_df])
df = df[['text', 'label']]  # Use only text and label
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle rows


Preprocess the Text

In [None]:
def clean_text(text):
    text = text.lower()  # Normalize
    text = re.sub(r'\[.*?\]', '', text)  # Remove square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation & numbers
    return text


In [None]:
df['clean_text'] = df['text'].apply(clean_text)


 Tokenization

In [None]:
# White space tokenizer
wst = WhitespaceTokenizer()
df['tokens_whitespace'] = df['clean_text'].apply(wst.tokenize)

# Regular expression tokenizer
regexp_tokenizer = RegexpTokenizer(r'\w+')
df['tokens_regexp'] = df['clean_text'].apply(regexp_tokenizer.tokenize)


In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

Stopwords Removal

In [None]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

df['tokens_cleaned'] = df['tokens_regexp'].apply(remove_stopwords)


Stemming and Lemmatization

In [14]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Apply both
df['stemmed'] = df['tokens_cleaned'].apply(lambda tokens: [stemmer.stem(w) for w in tokens])
df['lemmatized'] = df['tokens_cleaned'].apply(lambda tokens: [lemmatizer.lemmatize(w) for w in tokens])


 Convert Text to Numbers with TF-IDF

In [15]:
# Join tokens back into sentence
df['final_text'] = df['lemmatized'].apply(lambda x: ' '.join(x))

# Vectorize
tfidf = TfidfVectorizer(ngram_range=(1,2))  # Using n-grams (unigram + bigram)
X = tfidf.fit_transform(df['final_text'])
y = df['label']


Split Data into Training and Testing

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Logistic Regression Model (Supervised Learning)

In [17]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


Evaluation

In [18]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.987305122494432
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4731
           1       0.99      0.99      0.99      4249

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



Sigmoid Function (used inside logistic regression)

In [19]:
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

z = 0.5
print("Sigmoid of", z, "=", sigmoid(z))


Sigmoid of 0.5 = 0.6224593312018546


In [22]:
# Define preprocessing function
def preprocess_text(text):
    text = clean_text(text)
    tokens = regexp_tokenizer.tokenize(text)
    tokens = remove_stopwords(tokens)
    lemmatized = [lemmatizer.lemmatize(w) for w in tokens]
    final_text = ' '.join(lemmatized)
    return final_text

# Test news
test_news = "Breaking: Scientists discover a new planet in our solar system!"

# Predict
preprocessed = preprocess_text(test_news)
vectorized = tfidf.transform([preprocessed])
prediction = model.predict(vectorized)[0]

if prediction == 0:
    print("🟥 This news is most likely FAKE.")
else:
    print("✅ This news is most likely REAL.")


🟥 This news is most likely FAKE.
