In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the cleaned fake-news CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer='fake_news_cleaned_data.csv')
df.head(n=5)

Unnamed: 0,title,text,date,source,author,category,label,content,clean_content
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real,Foreign Democrat final. more tax development b...,foreign democrat final tax development store a...
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake,To offer down resource great point. probably g...,offer resource great point probably guess west...
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake,Himself church myself carry. them identify for...,church carry identify forward present success ...
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake,You unit its should. phone which item yard Rep...,unit phone item yard republican safe police id...
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake,Billion believe employee summer how. wonder my...,billion believe employee summer wonder fact di...


In [4]:
# Drop columns that are not used for model training.
# Only the candidates that are actually present in `df` are dropped, and the
# report lists exactly those. Re-running this cell, or loading a file that
# lacks one of the candidates, therefore no longer claims a drop that did not
# happen.
cols_to_drop = ['source', 'category', 'content', 'date', 'clean_content']
dropped = [c for c in cols_to_drop if c in df.columns]
# Rebind instead of mutating in place so the cell has no hidden side effect
# on any other reference to the original frame.
df = df.drop(columns=dropped)
print('Dropped (if existed):', dropped)

Dropped (if existed): ['source', 'category', 'content', 'date', 'clean_content']


In [5]:
# Count rows per author, most frequent first (these are the defaults, spelled out).
df['author'].value_counts(sort=True, ascending=False)

author
Michael Smith          12
John Smith             11
Christopher Johnson     9
Jennifer Davis          7
Michael Lee             7
                       ..
Andrew Stark            1
Samuel Gates            1
Kristen Buchanan        1
Deborah Leon            1
David Wise              1
Name: count, Length: 17051, dtype: int64

In [6]:
# Preview the first rows again to see which columns remain in `df`.
df.head(n=5)

Unnamed: 0,title,text,author,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,Paula George,real
1,To offer down resource great point.,probably guess western behind likely next inve...,Joseph Hill,fake
2,Himself church myself carry.,them identify forward present success risk sev...,Julia Robinson,fake
3,You unit its should.,phone which item yard Republican safe where po...,Mr. David Foster DDS,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,Austin Walker,fake


# TF-IDF: Turning text into numbers

We convert raw text into numeric features using TF-IDF so ML models can consume it.

- Term Frequency (TF): Counts how often a term appears in a document. For term `t` in document `d`, $\text{tf}(t,d)=\text{count}(t\text{ in }d)$ or a normalized variant.
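- Inverse Document Frequency (IDF): Down-weights terms that are common across many documents. For a corpus with `N` documents, of which `df(t)` contain the term `t`:
  $$\text{idf}(t)=\ln\frac{N+1}{\text{df}(t)+1}+1$$
  (This is scikit-learn's default, `smooth_idf=True`: adding 1 to both the numerator and the denominator avoids division by zero, and the trailing `+1` keeps terms that occur in every document from being weighted to zero.)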
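- TF-IDF: The product $\text{tfidf}(t,d)=\text{tf}(t,d)\times\text{idf}(t)$, which highlights terms that are frequent in a document but rare in the corpus. By default scikit-learn's `TfidfVectorizer` then rescales each document vector to unit Euclidean length (`norm='l2'`), so the individual scores shown later are no larger than 1.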

Benefits:
- Reduces impact of stopwords (e.g., "the", "is").
- Produces a sparse numeric matrix suitable for linear models (LogReg, SVM) and others.

Below we build TF-IDF features from the dataset's combined `title` and `author` columns and inspect the results.

In [None]:

import pandas as pd

# Start from a fresh copy of the cleaned dataset.
df = pd.read_csv('fake_news_cleaned_data.csv')

# DROP UNWANTED COLUMNS (candidates that are absent are skipped silently)
cols_to_drop = ['source', 'category', 'content', 'date', 'clean_content']
df = df.drop(columns=cols_to_drop, errors='ignore')

# Fail early if any column needed below is not in the file.
required = ['title', 'author', 'label']
missing = [name for name in required if name not in df.columns]
if len(missing) > 0:
    raise ValueError(f'Missing required columns for training: {missing}')

# Combine title and author into a single text feature for TF-IDF.
# Missing values become empty strings so the concatenation never yields NaN.
# NOTE(review): the article body column `text` is not part of X_text; only
# the title and the author name are vectorized.
title_part = df['title'].fillna('').astype(str)
author_part = df['author'].fillna('').astype(str)
X_text = (title_part + ' ' + author_part).astype(str)
y = df['label']

print('Using columns: title, author')
print('Label column: label')
print('Dataset size:', len(df))

Using columns: title, author
Label column: label
Dataset size: 20000


In [12]:

import os

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import joblib
import numpy as np

# converting --> TEXT TO NUMERICAL FEATURES
# Lower-cased word uni- and bi-grams with English stop words removed; terms
# seen in fewer than 2 documents are discarded and the vocabulary is capped
# at 100,000 entries.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    max_features=100_000,
    min_df=2,
)

# NOTE(review): the vectorizer is fitted on every row of X_text, so any
# train/test split performed on X afterwards shares vocabulary and IDF
# statistics between both partitions.
X = vectorizer.fit_transform(X_text)
print('TF-IDF shape:', X.shape)

# Fetch the vocabulary once; it is reused for the text dump and the preview.
feature_names = np.array(vectorizer.get_feature_names_out())

# Persist the matrix, the feature list and the fitted vectorizer.
out_dir = '.'
os.makedirs(out_dir, exist_ok=True)  # tolerate an out_dir that does not exist yet
sparse.save_npz(os.path.join(out_dir, 'TF_matrix.npz'), X)
with open(os.path.join(out_dir, 'TF_features.txt'), 'w', encoding='utf-8') as f:
    # One feature per line, in vocabulary-index order.
    f.writelines(f'{feat}\n' for feat in feature_names)

joblib.dump(vectorizer, os.path.join(out_dir, 'TF_vectorizer.joblib'))
print('Saved: TF_matrix.npz, TF_features.txt, TF_vectorizer.joblib')

print('Sample features:', feature_names[:20])

TF-IDF shape: (20000, 8460)
Saved: TF_matrix.npz, TF_features.txt, TF_vectorizer.joblib
Sample features: ['aaron' 'aaron alvarez' 'aaron jimenez' 'aaron rodriguez' 'aaron smith'
 'aaron stanley' 'aaron williams' 'abbott' 'abigail' 'ability'
 'ability garden' 'ability image' 'ability popular' 'ability range'
 'ability suggest' 'able' 'able decide' 'able john' 'able kyle' 'able lay']
Saved: TF_matrix.npz, TF_features.txt, TF_vectorizer.joblib
Sample features: ['aaron' 'aaron alvarez' 'aaron jimenez' 'aaron rodriguez' 'aaron smith'
 'aaron stanley' 'aaron williams' 'abbott' 'abigail' 'ability'
 'ability garden' 'ability image' 'ability popular' 'ability range'
 'ability suggest' 'able' 'able decide' 'able john' 'able kyle' 'able lay']


In [13]:

import numpy as np

# Report the IDF weight of a few hand-picked terms. Only terms that made it
# into the fitted vocabulary can be looked up, so filter first.
terms_to_check = ['trump', 'breaking', 'hillary', 'election', 'covid']
vocab = vectorizer.vocabulary_
existing_terms = [term for term in terms_to_check if term in vocab]
if not existing_terms:
    print('None of the sample terms were in the vocabulary; try different words.')
else:
    term_ids = [vocab[term] for term in existing_terms]
    idfs = vectorizer.idf_[term_ids]
    print('IDF values:')
    for position, term in enumerate(existing_terms):
        print(f'  {term:>10s} : {idfs[position]:.3f}')


# Despite its name, first_idx is the row with the most non-zero TF-IDF
# entries (argmax over the per-row non-zero counts), not row 0.
nonzero_per_doc = (X != 0).sum(axis=1)
first_idx = int(np.argmax(nonzero_per_doc))
row = X[first_idx]
row_coo = row.tocoo()

# Rank that row's (column, weight) pairs by weight, highest first, keep 15.
scored = list(zip(row_coo.col, row_coo.data))
scored.sort(key=lambda pair: pair[1], reverse=True)
pairs = scored[:15]
print('\nTop terms for a sample document:')
for col_id, score in pairs:
    print(f'  {feature_names[col_id]} : {score:.3f}')

IDF values:
    election : 6.194

Top terms for a sample document:
  hour society : 0.336
  join music : 0.336
  ms sarah : 0.336
  technology hour : 0.336
  rowe : 0.291
  ms : 0.265
  hour : 0.215
  thing : 0.212
  remain : 0.210
  society : 0.210
  join : 0.210
  music : 0.210
  technology : 0.209
  laugh : 0.207
  sarah : 0.201


In [10]:
# Train and evaluate Logistic Regression on TF-IDF features
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# X (TF-IDF matrix) and y (labels) come from previous cells.
# Hold out 20% of the rows, stratified on the label, with a fixed seed.
# NOTE(review): X was produced by fit_transform on all rows, so the held-out
# rows already contributed to the vocabulary and IDF weights.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

# fit() returns the estimator itself, so construction and training chain.
clf = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train, y_train)

# Score the held-out rows.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# NOTE(review): the saved output reports ~0.49 accuracy on two nearly
# balanced classes, i.e. chance level for these title+author features.
print(f'Logistic Regression accuracy: {acc:.4f}')
print('Confusion matrix:', cm, sep='\n')
print('Classification report:')
print(classification_report(y_true=y_test, y_pred=y_pred))

Logistic Regression accuracy: 0.4945
Confusion matrix:
[[1048  963]
 [1059  930]]
Classification report:
              precision    recall  f1-score   support

        fake       0.50      0.52      0.51      2011
        real       0.49      0.47      0.48      1989

    accuracy                           0.49      4000
   macro avg       0.49      0.49      0.49      4000
weighted avg       0.49      0.49      0.49      4000

