<a href="https://colab.research.google.com/github/SandeepKumar-05/FakeNewsDetectorAI/blob/main/FakeNewsDetectorAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# !pip install streamlit pyngrok transformers torch pillow pytesseract gensim joblib
# !apt-get install -y tesseract-ocr


In [None]:
# Mount Google Drive at /content/drive so the dataset CSVs and saved models
# under /content/drive/MyDrive/FakeNewsDetector are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the two source CSVs from the mounted Google Drive.
fakedf = pd.read_csv("/content/drive/MyDrive/FakeNewsDetector/Fake.csv")
truedf = pd.read_csv("/content/drive/MyDrive/FakeNewsDetector/True.csv")

# Labels: 0 = fake, 1 = real
fakedf['label'] = 0
truedf['label'] = 1

# Combine datasets, shuffle reproducibly, then drop rows missing a title or
# body. The index is reset *after* dropping so it stays contiguous (0..n-1),
# and the chain avoids in-place mutation so re-running the cell is idempotent.
combined_df = (
    pd.concat([fakedf, truedf])
    .sample(frac=1, random_state=42)
    .dropna(subset=['title', 'text'])
    .reset_index(drop=True)
)

# Single text field fed to both models: "<title>. <text>"
combined_df['full_text'] = combined_df['title'] + ". " + combined_df['text']


In [None]:
# Preview the fake-news frame (now carrying the label column assigned as 0).
fakedf.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [None]:
# Preview the real-news frame (now carrying the label column assigned as 1).
truedf.head()


Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [None]:
# Preview the shuffled, combined frame, including the derived full_text column.
combined_df.head()

Unnamed: 0,title,text,subject,date,label,full_text
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",0,Ben Stein Calls Out 9th Circuit Court: Committ...
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",1,Trump drops Steve Bannon from National Securit...
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",1,Puerto Rico expects U.S. to lift Jones Act shi...
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",0,OOPS: Trump Just Accidentally Confirmed He Le...
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",1,Donald Trump heads for Scotland to reopen a go...


In [None]:

# Extra tokenizer resource; presumably required by word_tokenize in the
# installed NLTK version in addition to 'punkt' — confirm.
nltk.download('punkt_tab')
# Tokenize text
# Lower-case each article's full_text and split it into word tokens.
combined_df['tokens'] = combined_df['full_text'].apply(lambda x: word_tokenize(x.lower()))

# Train Word2Vec
# 100-dimensional embeddings, context window of 5, min_count=1 (no token is
# dropped for being rare), 4 worker threads.
# NOTE(review): the embeddings are trained on the whole corpus, including rows
# that are later placed in the validation split — confirm this is acceptable.
# NOTE(review): no seed is passed, so embeddings may differ between runs.
w2v_model = Word2Vec(sentences=combined_df['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Convert each text into vector
def get_vector(tokens, model, size):
    """Average the word vectors of the tokens known to ``model``.

    Args:
        tokens: Iterable of word tokens for one document.
        model: Object exposing ``model.wv``, which supports ``word in wv``
            and ``wv[word]`` lookups (e.g. a trained Word2Vec model).
        size: Length of the returned vector; used for the zero accumulator.

    Returns:
        A NumPy array of length ``size`` holding the mean of the vectors of
        all in-vocabulary tokens, or all zeros when none are in vocabulary.
    """
    embeddings = model.wv
    known = [embeddings[tok] for tok in tokens if tok in embeddings]

    total = np.zeros(size)
    if not known:
        # Nothing to average: return the zero vector unchanged.
        return total

    # Accumulate in token order, then divide by the number of hits.
    for emb in known:
        total += emb
    return total / len(known)

# Embed every article as the mean of its word vectors
# (100 matches the vector_size used when training Word2Vec).
combined_df['vectors'] = combined_df['tokens'].apply(
    lambda x: get_vector(x, w2v_model, 100)
)

# Feature matrix (one row per article) and label vector.
y = combined_df['label'].values
X = np.vstack(combined_df['vectors'].values)

# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit a default logistic regression on the averaged embeddings.
lr_model = LogisticRegression().fit(X_train, y_train)

# Score on the held-out rows.
preds = lr_model.predict(X_val)
acc = accuracy_score(y_val, preds)
print("🔍 Word2Vec + Logistic Regression Accuracy:", acc)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


🔍 Word2Vec + Logistic Regression Accuracy: 0.9910913140311804


In [None]:
import joblib

# Persist both halves of the Word2Vec pipeline to Drive: the embedding model
# and the fitted logistic-regression classifier.
# NOTE: joblib files are pickles — only load them back from a trusted location.
w2v_model.save("/content/drive/MyDrive/FakeNewsDetector/word2vec_fakenews.model")
joblib.dump(lr_model, "/content/drive/MyDrive/FakeNewsDetector/logreg_w2v_model.pkl")


['/content/drive/MyDrive/FakeNewsDetector/logreg_w2v_model.pkl']

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Keep only the model input text and its label.
df = combined_df[['full_text', 'label']].copy()
# Fine-tune on a reproducible 10% subsample, split 80/20 into train/validation.
# NOTE(review): the subsample is presumably there to keep training time
# manageable — confirm before comparing against the full-data Word2Vec result.
train_df, val_df = train_test_split(df.sample(frac=0.1, random_state=42), test_size=0.2, random_state=42)

# Wrap the pandas frames as Hugging Face datasets.
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    """Tokenize a batch of examples for the transformer model.

    Reads the ``"full_text"`` field of ``examples`` and passes it to the
    notebook-level ``tokenizer`` with truncation enabled and padding set to
    ``"max_length"``. No explicit ``max_length`` is given, so the tokenizer's
    own maximum applies.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    texts = examples["full_text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize both splits in batches; the tokenizer's output columns are added
# to each dataset.
train_ds = train_ds.map(tokenize_function, batched=True)
val_ds = val_ds.map(tokenize_function, batched=True)

# Expose only the columns the model consumes, as torch tensors.
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3592 [00:00<?, ? examples/s]

Map:   0%|          | 0/898 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pretrained DistilBERT with a fresh 2-class classification head
# (labels: 0 = fake, 1 = real).
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# One epoch of fine-tuning, batch size 8, checkpoint saved at the end of each
# epoch under ./results.
# NOTE(review): metric_for_best_model is set, but no evaluation strategy or
# load_best_model_at_end is configured here, so it likely has no effect — confirm.
# NOTE(review): the reporting backend is left at its default; the saved output
# shows a wandb login, which can block an unattended Run All — consider
# setting report_to explicitly.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    metric_for_best_model="accuracy",
    overwrite_output_dir=True,
)

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: Two-item pair unpacked as ``(logits, labels)``; the
            predicted class is the argmax of the logits along axis 1.

    Returns:
        A dict with a single key, ``"accuracy"``.
    """
    scores, gold = eval_pred
    return {"accuracy": accuracy_score(gold, np.argmax(scores, axis=1))}

# Wire the model, arguments, tokenized splits and accuracy metric together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

# Fine-tune, then score on the validation split; as the cell's last
# expression, the evaluate() metrics dict is what the cell displays.
trainer.train()
trainer.evaluate()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mksandeep36723[0m ([33mksandeep36723-jyothi-engineering-college[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


Step,Training Loss


{'eval_loss': 0.008576590567827225,
 'eval_accuracy': 0.9988864142538976,
 'eval_runtime': 934.8738,
 'eval_samples_per_second': 0.961,
 'eval_steps_per_second': 0.121,
 'epoch': 1.0}

In [None]:
# Save the fine-tuned model and its tokenizer side by side in one Drive
# directory so both can be reloaded from the same path.
model.save_pretrained("/content/drive/MyDrive/FakeNewsDetector/transformer_model")
tokenizer.save_pretrained("/content/drive/MyDrive/FakeNewsDetector/transformer_model")



('/content/drive/MyDrive/FakeNewsDetector/transformer_model/tokenizer_config.json',
 '/content/drive/MyDrive/FakeNewsDetector/transformer_model/special_tokens_map.json',
 '/content/drive/MyDrive/FakeNewsDetector/transformer_model/vocab.txt',
 '/content/drive/MyDrive/FakeNewsDetector/transformer_model/added_tokens.json',
 '/content/drive/MyDrive/FakeNewsDetector/transformer_model/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# ✅ Load the trained model
# Points at the directory the fine-tuned model and tokenizer were saved to on
# Drive. A bare relative "transformer_model" only resolves if a copy happens
# to exist in the current working directory, which breaks a fresh
# Restart & Run All.
MODEL_PATH = "/content/drive/MyDrive/FakeNewsDetector/transformer_model"  # Change to your model path or use "distilbert-base-uncased"

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
except Exception as e:
    # Report the failure for the notebook reader, then re-raise so the cell
    # still fails loudly instead of continuing with a stale model/tokenizer.
    print(f"❌ Failed to load model/tokenizer: {e}")
    raise

# ✅ Prediction function
def predict_fake_news(text, model, tokenizer, max_length=256):
    """Classify one piece of text as "FAKE" or "REAL" and print the result.

    Args:
        text: The headline or article text to classify (a single string).
        model: Sequence-classification model whose output exposes ``.logits``.
            Class index 0 is reported as FAKE, any other index as REAL.
        tokenizer: Tokenizer matching ``model``; called with
            ``return_tensors="pt"``.
        max_length: Maximum number of tokens kept; longer inputs are
            truncated. Defaults to 256.

    Returns:
        The string ``"FAKE"`` or ``"REAL"``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Tokenize input. padding=True pads only up to the longest sequence in
    # the batch, so a single text gets no pad tokens at all; padding every
    # short headline out to max_length only added wasted compute, and pad
    # positions are masked out, so this should not affect the prediction.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference (no gradients needed)
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()

    label = "FAKE" if prediction == 0 else "REAL"
    print(f"📰 Input: {text}\n🔍 Prediction: {label}")
    return label

# ✅ Example Predictions
# Each call prints the input and predicted label; as the cell's last
# expression, the second call's return value is also displayed.
# NOTE(review): the saved output shows "Iran and Israel stop the war", which
# does not match the first input string below — the output looks stale; re-run.
predict_fake_news("Iran and Israel stoped the war", model, tokenizer)
predict_fake_news("Aliens will play cricket in Kerala tomorrow", model, tokenizer)


📰 Input: Iran and Israel stop the war
🔍 Prediction: REAL
📰 Input: Aliens will play cricket in Kerala tomorrow
🔍 Prediction: FAKE


'FAKE'