In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("fake.csv")

In [3]:
df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [8]:
import re

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)        # remove links
    text = re.sub(r"[^a-z\s]", "", text)       # remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip()   # remove extra spaces
    return text

# Combine 'title' + 'text' and clean
df["clean"] = (df["title"].fillna("") + " " + df["text"].fillna("")).apply(clean_text)
print(df["clean"].head())


0    donald trump sends out embarrassing new years ...
1    drunk bragging trump staffer started russian c...
2    sheriff david clarke becomes an internet joke ...
3    trump is so obsessed he even has obamas name c...
4    pope francis just called out donald trump duri...
Name: clean, dtype: object


In [5]:
with open("news_corpus.txt", "w", encoding="utf-8") as f:
    for line in df["clean_text"].dropna():
        f.write(line + "\n")


In [6]:
from tokenizers import ByteLevelBPETokenizer
import os

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files="news_corpus.txt", vocab_size=30000, min_frequency=3, special_tokens=["<PAD>", "<UNK>", "<CLS>", "<SEP>"])
os.makedirs("news_tokenizer", exist_ok=True)
tokenizer.save_model("news_tokenizer")

# Test
encoded = tokenizer.encode("Breaking: Government introduces new data policy.")
print(encoded.tokens)


['B', 're', 'aking', ':', 'Ġ', 'G', 'overnment', 'Ġintroduces', 'Ġnew', 'Ġdata', 'Ġpolicy', '.']


In [9]:
MAX_LEN = 200

def encode_text(text):
    ids = tokenizer.encode(text).ids[:MAX_LEN]
    pad_len = MAX_LEN - len(ids)
    return ids + [0]*pad_len

df["input_ids"] = df["clean"].apply(encode_text)
print(df["input_ids"].iloc[0][:20])


[2328, 355, 9893, 462, 4744, 460, 841, 7155, 2023, 379, 317, 4487, 617, 355, 503, 2513, 260, 4462, 453, 1010]


In [10]:
df.to_csv("cleaned_dataset.csv", index=False)


In [12]:
!pip install torch


Collecting torch
  Downloading torch-2.9.0-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.9.0-cp312-cp312-win_amd64.whl (109.3 MB)
   ---------------------------------------- 0.0/109.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/109.3 MB 2.0 MB/s eta 0:00:54
   ---------------------------------------- 0.2/109.3 MB 2.3 MB/s eta 0:00:48
   ---------------------------------------- 0.3/109.3 MB 2.3 MB/s eta 0:00:49
   ---------------------------------------- 0.3/109.3 MB 1.8 MB/s eta 0:01:02
   ---------------------------------------- 0.4/109.3 MB 2.1 MB/s eta 0:00:53
   ---------------------------------------- 0.6/109.3 MB 2.2 MB/s eta 0:00:50
   ---------------------------------------- 0.6/109.3 MB 2.3 MB/s eta 0:00:48
   ---------------------------------------- 0.6/109.3 MB 2.0 MB/s eta 0:00:54
   ---------------------------------------- 0.8/109.3 MB 1

In [16]:
import torch
from sklearn.model_selection import train_test_split

X = torch.tensor(df["input_ids"].tolist())
y = torch.tensor(df["label"].values)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


KeyError: 'label'

In [15]:
print(df["clean"].head())
print(df["input_ids"].head())


0    donald trump sends out embarrassing new years ...
1    drunk bragging trump staffer started russian c...
2    sheriff david clarke becomes an internet joke ...
3    trump is so obsessed he even has obamas name c...
4    pope francis just called out donald trump duri...
Name: clean, dtype: object
0    [2328, 355, 9893, 462, 4744, 460, 841, 7155, 2...
1    [71, 28127, 8514, 355, 5673, 2289, 1121, 4452,...
2    [24998, 2384, 11176, 4997, 289, 2892, 4152, 32...
3    [975, 317, 511, 8997, 314, 654, 414, 7410, 173...
4    [83, 1091, 4028, 503, 1093, 462, 617, 355, 842...
Name: input_ids, dtype: object


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Vectorize your tokenized text as strings for TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

df["joined_tokens"] = df["clean"].astype(str)  # join tokenized list to string
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["joined_tokens"])
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.5084096231637215


In [18]:
import numpy as np

# Add a fake binary label column for testing the ML pipeline
df["label"] = np.random.randint(0, 2, size=len(df))

print(df[["clean", "label"]].head())


                                               clean  label
0  donald trump sends out embarrassing new years ...      0
1  drunk bragging trump staffer started russian c...      0
2  sheriff david clarke becomes an internet joke ...      1
3  trump is so obsessed he even has obamas name c...      0
4  pope francis just called out donald trump duri...      0


In [20]:
df.columns

Index(['title', 'text', 'subject', 'date', 'clean_text', 'clean', 'input_ids',
       'joined_tokens', 'label'],
      dtype='object')

In [21]:
df['label'].head()

0    0
1    0
2    1
3    0
4    0
Name: label, dtype: int32

In [22]:
df["tokenized"] = df["clean"].apply(lambda t: " ".join(tokenizer.encode(t).tokens))


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X = vectorizer.fit_transform(df["tokenized"])


In [24]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=3, solver='lbfgs', max_iter=2000)


In [None]:
from xgboost import XGBClassifier
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')


In [None]:
pred1 = logreg.predict_proba(X_test)[:,1]
pred2 = xgb.predict_proba(X_test)[:,1]
final_pred = (pred1 + pred2)/2


In [26]:
!pip install nltk xgboost --quiet
import pandas as pd, numpy as np, re, nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("fake.csv")
print(df.columns)
# id, title, author, text, label


Index(['title', 'text', 'subject', 'date'], dtype='object')


In [28]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    text = " ".join([w for w in text.split() if w not in stop_words])
    return text

df["clean"] = (df["title"].fillna("") + " " + df["text"].fillna("")).apply(clean_text)
print("Sample cleaned text:\n", df["clean"].iloc[0][:500])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shahr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Sample cleaned text:
 donald trump sends embarrassing new years eve message disturbing donald trump wish americans happy new year leave instead give shout enemies haters dishonest fake news media former reality show star one job country rapidly grows stronger smarter want wish friends supporters enemies haters even dishonest fake news media happy healthy new year president angry pants tweeted great year america country rapidly grows stronger smarter want wish friends supporters enemies haters even dishonest fake news


In [29]:
from tokenizers import ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer("news_tokenizer/vocab.json", "news_tokenizer/merges.txt")
df["clean"] = df["clean"].apply(lambda t: " ".join(tokenizer.encode(t).tokens))


In [32]:
import numpy as np
df["label"] = np.random.randint(0, 2, size=len(df))

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X = vectorizer.fit_transform(df["clean"])
y = df["label"]


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [34]:
model = LogisticRegression(C=3, solver='lbfgs', max_iter=2000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.490738769427294
              precision    recall  f1-score   support

           0       0.49      0.50      0.50      2352
           1       0.49      0.48      0.49      2345

    accuracy                           0.49      4697
   macro avg       0.49      0.49      0.49      4697
weighted avg       0.49      0.49      0.49      4697



In [35]:
from xgboost import XGBClassifier

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)

y_pred2 = xgb.predict(X_test)
print("XGB Accuracy:", accuracy_score(y_test, y_pred2))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGB Accuracy: 0.4992548435171386
