<a href="https://colab.research.google.com/github/SreeSajeev/FakeNewsDetector/blob/main/fakenewsdetector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the labelled news dataset.
# The path is relative so the notebook runs on any machine (including Colab);
# set the NEWS_CSV environment variable to point at a copy stored elsewhere.
import os
from pathlib import Path

import pandas as pd  # imported here because this cell runs before the main import cell

DATA_PATH = Path(os.environ.get('NEWS_CSV', 'news.csv'))
df = pd.read_csv(DATA_PATH)


In [None]:
# Quick look at the first rows, then the column dtypes and non-null counts.
print(df.head())
# info() writes its report itself and returns None, so wrapping it in print()
# would append a stray "None" line to the output.
df.info()


   Unnamed: 0                                              title  \
0        8476                       You Can Smell Hillary’s Fear   
1       10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2        3608        Kerry to go to Paris in gesture of sympathy   
3       10142  Bernie supporters on Twitter erupt in anger ag...   
4         875   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [None]:

# Remove the leftover CSV index column, but only when the frame actually has it
index_column = 'Unnamed: 0'
if index_column in df.columns:
    df = df.drop(columns=index_column)

In [None]:
# Build full_text as "<title> <text>", treating missing values as empty strings
title_part = df['title'].fillna('')
body_part = df['text'].fillna('')
df['full_text'] = title_part + ' ' + body_part

In [None]:

# Preview the raw columns the pipeline works from
preview_columns = ['title', 'text', 'label']
print(df[preview_columns].head())

                                               title  \
0                       You Can Smell Hillary’s Fear   
1  Watch The Exact Moment Paul Ryan Committed Pol...   
2        Kerry to go to Paris in gesture of sympathy   
3  Bernie supporters on Twitter erupt in anger ag...   
4   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  


In [None]:
def clean_text(text):
    """Normalise a document to lowercase ASCII letters separated by single spaces.

    Steps, in order: lowercase the input, delete every character that is not
    a-z or whitespace (digits and punctuation are removed, not replaced by a
    space), collapse whitespace runs to one space, and trim both ends.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()

# Run the cleaner over every merged title+text document
cleaned_documents = df['full_text'].apply(clean_text)
df['clean_text'] = cleaned_documents


In [None]:
# English stop words from NLTK, held in a set for fast membership checks
stop_words = {word for word in stopwords.words('english')}

def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stop_words` removed.

    The surviving tokens keep their original order and are re-joined with
    single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Strip stop words from the already-cleaned documents
without_stopwords = df['clean_text'].apply(remove_stopwords)
df['clean_text'] = without_stopwords


In [None]:
# Encode labels as integers: 'FAKE' -> 0, 'REAL' -> 1.
# The guard keeps this cell idempotent: calling .map() again on labels that
# are already 0/1 would turn every value into NaN.
LABEL_TO_INT = {'FAKE': 0, 'REAL': 1}
if not df['label'].isin(list(LABEL_TO_INT.values())).all():
    encoded_labels = df['label'].map(LABEL_TO_INT)
    if encoded_labels.isna().any():
        # Fail here with a clear message instead of carrying NaN labels into training
        unknown_labels = df.loc[encoded_labels.isna(), 'label'].unique().tolist()
        raise ValueError(f"Unexpected label values: {unknown_labels}")
    df['label'] = encoded_labels


In [None]:
# Sanity check: cleaned text next to its encoded label, then the class counts
model_input_preview = df[['clean_text', 'label']].head()
label_counts = df['label'].value_counts()  # Check balance
print(model_input_preview)
print(label_counts)


                                          clean_text  label
0  smell hillarys fear daniel greenfield shillman...      0
1  watch exact moment paul ryan committed politic...      0
2  kerry go paris gesture sympathy us secretary s...      1
3  bernie supporters twitter erupt anger dnc trie...      0
4  battle new york primary matters primary day ne...      1
label
1    3171
0    3164
Name: count, dtype: int64


In [None]:
# Baseline model: TF-IDF features + Logistic Regression
# Step 1: hold out 20% of the rows as a test set, stratified on the label
from sklearn.model_selection import train_test_split

X, y = df['clean_text'], df['label']

split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# TF-IDF vectorization: the vectorizer is fitted on the training split only,
# then reused unchanged to transform the test split
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)


In [None]:
#Train a Classifier (Logistic Regression)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fit the baseline classifier on the TF-IDF training matrix (fit() returns the estimator)
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Score it on the held-out split
y_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Classification Report:\n", report)


Accuracy: 0.9194948697711128
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.94      0.92       633
           1       0.94      0.90      0.92       634

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267



In [None]:
## Get top contributing words
# Labels are encoded FAKE=0 / REAL=1, and for a binary LogisticRegression
# coef_[0] holds the weights of the positive class (1 = REAL). Positive
# coefficients therefore push a prediction towards REAL and negative
# coefficients towards FAKE.
feature_names = tfidf.get_feature_names_out()
coefficients = model.coef_[0]

ascending_order = np.argsort(coefficients)  # most negative coefficient first
top_fake = ascending_order[:10]             # most negative -> strongest FAKE indicators
top_real = ascending_order[-10:][::-1]      # most positive -> strongest REAL indicators

print("🔴 Top 'FAKE' indicators:")
for i in top_fake:
    print(f"{feature_names[i]}: {coefficients[i]:.4f}")

print("\n🟢 Top 'REAL' indicators:")
for i in top_real:
    print(f"{feature_names[i]}: {coefficients[i]:.4f}")


🔴 Top 'FAKE' indicators:
fox: 2.0629
islamic: 2.1180
state: 2.1211
debate: 2.2950
president: 2.3083
sanders: 2.4135
candidates: 2.5277
cruz: 2.6956
gop: 2.7550
said: 6.5626

🟢 Top 'REAL' indicators:
october: -4.5693
hillary: -3.8679
election: -2.8209
november: -2.8071
article: -2.4623
share: -2.4381
source: -2.2480
fbi: -2.2260
russia: -2.2145
wikileaks: -1.9598


In [None]:
#Create a Prediction + Explanation Function
import numpy as np

def explain_prediction(text, model, vectorizer, top_n=10):
    """Classify one document and list the words that contributed most.

    Parameters
    ----------
    text : str
        Document to classify; it is passed to `vectorizer.transform` as-is.
    model : fitted linear classifier
        Must provide `predict`, `predict_proba` and `coef_`; only `coef_[0]`
        is used for the explanation.
    vectorizer : fitted vectorizer
        Must provide `transform` and `get_feature_names_out`.
    top_n : int, default 10
        Maximum number of (word, score) pairs to return.

    Returns
    -------
    dict
        'prediction' : predicted label for the document,
        'confidence' : highest class probability, rounded to 2 decimals,
        'top_words'  : up to `top_n` (word, score) pairs ordered by
                       decreasing absolute score, where score is the
                       document's feature weight times the coefficient.
    """
    # Vectorize once and reuse the row for the prediction and the explanation
    vec = vectorizer.transform([text])
    prediction = model.predict(vec)[0]
    proba = model.predict_proba(vec)[0]

    # Per-feature contribution: feature weight in this document * model coefficient
    contributions = vec.toarray()[0] * model.coef_[0]
    vocabulary = vectorizer.get_feature_names_out()

    # Keep only the features that contribute, then rank by magnitude
    contributing = [
        (word, score)
        for word, score in zip(vocabulary, contributions)
        if score != 0
    ]
    ranked = sorted(contributing, key=lambda pair: abs(pair[1]), reverse=True)

    return {
        'prediction': prediction,
        'confidence': round(np.max(proba), 2),
        'top_words': ranked[:top_n]
    }


In [None]:
sample_text = "Hillary Clinton was under FBI investigation during the elections."
# Apply the same cleaning and stop-word removal used on the training data, so the
# features at prediction time match what the vectorizer and model were fitted on.
prepared_text = remove_stopwords(clean_text(sample_text))
result = explain_prediction(prepared_text, model, tfidf)

print(f"Prediction: {result['prediction']} ({result['confidence'] * 100:.2f}%)")
print("Top Influential Words:")
for word, score in result['top_words']:
    print(f"{word}: {score:.4f}")


Prediction: 0 (99.00%)
Top Influential Words:
hillary: -1.2322
fbi: -1.1663
investigation: -0.4663
elections: -0.3324
clinton: 0.0909


# Dynamic Misinformation Narrative Tracking & Visualization

In [None]:
# Install all required libraries
!pip install pandas numpy spacy scikit-learn plotly pyvis \
            sentence-transformers yake networkx \
            transformers shap matplotlib

# Download spaCy English model
!python -m spacy download en_core_web_sm


Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting segtok (from yake)
  Downloading segtok-1.5.11-py3-none-any.whl.metadata (9.0 kB)
Collecting jellyfish (from yake)
  Downloading jellyfish-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.6 kB)
Collecting jedi>=0.16 (from ipython>=5.3.0->pyvis)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
