In [9]:
# Import required libraries
import pandas as pd
import spacy
from tqdm import tqdm
import sys

# Load spaCy model.
# Only lemmas and the lexical flags is_stop / is_punct are read from the parsed
# docs in this notebook, so the dependency parser and the named-entity
# recognizer are disabled: they do not feed the lemmatizer and skipping them
# makes the (long) cleaning pass considerably cheaper.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Load CSV files
fake_df = pd.read_csv("fake1.csv")
real_df = pd.read_csv("true1.csv")

# Add labels
fake_df["label"] = 0  # Fake news
real_df["label"] = 1  # Real news

# Combine and shuffle datasets.
# The shuffle is seeded so that row order -- and therefore the seeded
# train/test split derived from it later -- is the same on every run.
df = pd.concat([fake_df, real_df], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Batch text cleaning with a tqdm progress bar (default output stream)
def clean_texts(texts):
    """Lemmatize every text and drop stop words and punctuation.

    Args:
        texts: A sized iterable of strings (len() is used for the progress
            bar total); it is streamed through ``nlp.pipe`` in batches of 50.

    Returns:
        A list with one string per input text: the lemmas of the kept tokens
        joined by single spaces, in input order.
    """
    parsed_docs = tqdm(
        nlp.pipe(texts, batch_size=50),
        total=len(texts),
        desc="🧹 Cleaning Text",
        leave=True,
    )
    return [
        " ".join(tok.lemma_ for tok in parsed if not (tok.is_stop or tok.is_punct))
        for parsed in parsed_docs
    ]

# Clean title and text columns (shows progress).
# Missing values are replaced with "" before the string cast: astype(str) on
# its own would turn them into the literal text "nan", which would then end
# up in the features as a bogus token.
print("Cleaning titles...")
df['clean_title'] = clean_texts(df['title'].fillna("").astype(str))

print("Cleaning bodies...")
df['clean_text'] = clean_texts(df['text'].fillna("").astype(str))

# Combine cleaned title and text
df['combined_text'] = df['clean_title'] + " " + df['clean_text']

# Preview the cleaned dataset
print("\n Preview of cleaned data:")
print(df[['title', 'combined_text', 'label']].head())

# Save to a new CSV file. The file name carries the .csv extension so that it
# matches the message printed below and the "cleanedfakenews1.csv" path that
# is read back with pd.read_csv further down the notebook.
df.to_csv("cleanedfakenews1.csv", index=False)
print("\nCleaned dataset saved as 'cleanedfakenews1.csv'")

Cleaning titles...


🧹 Cleaning Text: 100%|██████████| 44898/44898 [05:32<00:00, 135.02it/s]


Cleaning bodies...


🧹 Cleaning Text: 100%|██████████| 44898/44898 [1:02:40<00:00, 11.94it/s]



 Preview of cleaned data:
                                               title  \
0  Moscow, Seoul closer on North Korea after thei...   
1  NIGHTMARE SCENARIO: FOX NEWS Reports Obama Can...   
2  Egypt says suspended U.S. military exercises t...   
3  North Carolina transgender bathroom law faces ...   
4   Mike Pence Gets A Special Gift In His Office ...   

                                       combined_text  label  
0  Moscow Seoul close North Korea leader meet RIA...      1  
1  NIGHTMARE SCENARIO FOX NEWS report Obama Appoi...      0  
2  Egypt say suspend U.S. military exercise resum...      1  
3  North Carolina transgender bathroom law face f...      1  
4    Mike Pence get Special Gift Office Mail Mont...      0  

Cleaned dataset saved as 'cleanedfakenews1.csv'


In [10]:
from sklearn.model_selection import train_test_split  # Import this

# Features are the cleaned, combined text; targets are the 0 (fake) / 1 (real) labels
X1, Y1 = df['combined_text'], df['label']

# Hold out 20% of the rows for testing, with a fixed seed for the split
split_parts = train_test_split(X1, Y1, test_size=0.2, random_state=42)
X1_train, X1_test, Y1_train, Y1_test = split_parts

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Create a TF-IDF vectorizer, capped at 5000 features via max_features
vectorizer1 = TfidfVectorizer(max_features=5000)

# Fit on the training split only and transform it in the same call, so the
# test split has no influence on the fitted vectorizer
X1_train_vec = vectorizer1.fit_transform(X1_train)

# Transform the test split with the already-fitted vectorizer (no refit)
X1_test_vec = vectorizer1.transform(X1_test)

In [13]:
# Sanity check: print the .shape of the train and test TF-IDF matrices
print("TF-IDF shape for X_train:", X1_train_vec.shape)
print("TF-IDF shape for X_test:", X1_test_vec.shape)

TF-IDF shape for X_train: (35918, 5000)
TF-IDF shape for X_test: (8980, 5000)


In [14]:
import joblib
import pandas as pd
# Reload the cleaned dataset from disk. This rebinds `df`; nothing else in
# this cell reads it.
# NOTE(review): the path must match the name the cleaning step saved under -- confirm.
df = pd.read_csv("cleanedfakenews1.csv")

# Persist the fitted TF-IDF vectorizer so new text can be transformed the same way later
joblib.dump(vectorizer1, 'tfidf_vectorizer.pkl')

# Persist the vectorized train/test matrices and their labels (optional but helpful).
# The final joblib.dump is the cell's last expression, so its return value is
# what the notebook displays as the cell output.
joblib.dump(X1_train_vec, 'X1_train_vec.pkl')
joblib.dump(X1_test_vec, 'X1_test_vec.pkl')
joblib.dump(Y1_train, 'Y1_train.pkl')
joblib.dump(Y1_test, 'Y1_test.pkl')




['Y1_test.pkl']

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [16]:
# Logistic regression: balanced class weights, seeded, up to 1000 solver iterations
log_model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)

# fit() returns the estimator itself, so training and prediction can be chained
log_preds = log_model.fit(X1_train_vec, Y1_train).predict(X1_test_vec)

# Score the held-out predictions, then report
log_accuracy = accuracy_score(Y1_test, log_preds)
log_confusion = confusion_matrix(Y1_test, log_preds)
log_report = classification_report(Y1_test, log_preds)

print("\nLogistic Regression Results:")
print("Accuracy:", log_accuracy)
print("Confusion Matrix:\n", log_confusion)
print("Classification Report:\n", log_report)



Logistic Regression Results:
Accuracy: 0.9865256124721603
Confusion Matrix:
 [[4628   68]
 [  53 4231]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4696
           1       0.98      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [17]:
# Multinomial Naive Bayes with default settings
nb_model = MultinomialNB()

# fit() returns the estimator itself, so training and prediction can be chained
nb_preds = nb_model.fit(X1_train_vec, Y1_train).predict(X1_test_vec)

# Score the held-out predictions, then report
nb_accuracy = accuracy_score(Y1_test, nb_preds)
nb_confusion = confusion_matrix(Y1_test, nb_preds)
nb_report = classification_report(Y1_test, nb_preds)

print("\nNaive Bayes Results:")
print("Accuracy:", nb_accuracy)
print("Confusion Matrix:\n", nb_confusion)
print("Classification Report:\n", nb_report)


Naive Bayes Results:
Accuracy: 0.9256124721603564
Confusion Matrix:
 [[4393  303]
 [ 365 3919]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93      4696
           1       0.93      0.91      0.92      4284

    accuracy                           0.93      8980
   macro avg       0.93      0.93      0.93      8980
weighted avg       0.93      0.93      0.93      8980



In [18]:
import joblib
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd


# Configure the XGBoost classifier (binary logistic objective, log-loss metric)
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
xgb_model = XGBClassifier(**xgb_params)

# Train on the TF-IDF training matrix, then predict the held-out split
xgb_model.fit(X1_train_vec, Y1_train)
xgb_pred = xgb_model.predict(X1_test_vec)

# Evaluate: accuracy, per-class report, and a labelled confusion matrix
xgb_accuracy = accuracy_score(Y1_test, xgb_pred)
print("XGBoost Accuracy:", xgb_accuracy)

print("\nClassification Report:")
xgb_report = classification_report(Y1_test, xgb_pred, target_names=["Fake", "Real"])
print(xgb_report)

cm = confusion_matrix(Y1_test, xgb_pred, labels=[0,1])
cm_table = pd.DataFrame(
    cm,
    index=["Actual Fake","Actual Real"],
    columns=["Pred Fake","Pred Real"],
)
print("\nConfusion Matrix (rows=true, cols=predicted):")
print(cm_table)

# Persist the trained model
joblib.dump(xgb_model, 'xgboost_model.pkl')
print("\nSaved XGBoost model as 'xgboost_model.pkl'")


XGBoost Accuracy: 0.9972160356347439

Classification Report:
              precision    recall  f1-score   support

        Fake       1.00      1.00      1.00      4696
        Real       1.00      1.00      1.00      4284

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980


Confusion Matrix (rows=true, cols=predicted):
             Pred Fake  Pred Real
Actual Fake       4681         15
Actual Real         10       4274

Saved XGBoost model as 'xgboost_model.pkl'


In [19]:
# LightGBM classifier with balanced class weights and a fixed seed
lgb_model = lgb.LGBMClassifier(class_weight='balanced', random_state=42)

# fit() returns the estimator itself, so training and prediction can be chained
lgb_preds = lgb_model.fit(X1_train_vec, Y1_train).predict(X1_test_vec)

# Score the held-out predictions, then report
lgb_accuracy = accuracy_score(Y1_test, lgb_preds)
lgb_confusion = confusion_matrix(Y1_test, lgb_preds)
lgb_report = classification_report(Y1_test, lgb_preds)

print("\nLightGBM Results:")
print("Accuracy:", lgb_accuracy)
print("Confusion Matrix:\n", lgb_confusion)
print("Classification Report:\n", lgb_report)

[LightGBM] [Info] Number of positive: 17133, number of negative: 18785
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.472458 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 737187
[LightGBM] [Info] Number of data points in the train set: 35918, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

LightGBM Results:
Accuracy: 0.9972160356347439
Confusion Matrix:
 [[4680   16]
 [   9 4275]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4696
           1       1.00      1.00      1.00      4284

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980





In [20]:
# Persist all four trained classifiers with joblib so they can be reloaded
# without retraining. The last dump is the cell's final expression, so its
# return value is what the notebook shows as the cell output.
joblib.dump(log_model, 'logistic_model.pkl')
joblib.dump(nb_model, 'naive_bayes_model.pkl')
joblib.dump(xgb_model, 'xgboost_model.pkl')
joblib.dump(lgb_model, 'lightgbm_model.pkl')


['lightgbm_model.pkl']

In [26]:
import joblib
import spacy

# spaCy pipeline used to preprocess user-supplied text
nlp = spacy.load("en_core_web_sm")

# Reload the four persisted classifiers, in the same order as they are named.
# NOTE(review): joblib.load unpickles its input, so only load files you trust.
log_model, nb_model, xgb_model, lgb_model = (
    joblib.load(model_path)
    for model_path in (
        "logistic_model.pkl",
        "naive_bayes_model.pkl",
        "xgboost_model.pkl",
        "lightgbm_model.pkl",
    )
)

# Reload the persisted TF-IDF vectorizer
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Preprocessing for a single piece of user text
def clean_input(text):
    """Lemmatize ``text`` and drop stop words and punctuation.

    Args:
        text: The raw string to clean; it is parsed with the module-level ``nlp``.

    Returns:
        The lemmas of the kept tokens, joined by single spaces.
    """
    kept_lemmas = (
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    )
    return " ".join(kept_lemmas)

# User input
user_title = input("Enter the news title: ")
user_text = input("Enter the news body: ")

# Raw title + body, kept under its original name
combined_text = user_title + " " + user_text

# Clean the title and the body separately and then join them with a space.
# The training features were built the same way (cleaned title + " " +
# cleaned text), so inference now goes through an identical preprocessing path
# instead of parsing title and body as one document.
cleaned_text = clean_input(user_title) + " " + clean_input(user_text)

# Vectorize with the fitted TF-IDF vectorizer (expects an iterable of documents)
vectorized_input = vectorizer.transform([cleaned_text])

# Predictions: label 1 means real news, anything else is reported as fake.
# The XGBoost line previously printed the misspelled label "XBBOOST".
print("\n🧠 Predictions:")
for model_name, model in (
    ("Logistic Regression", log_model),
    ("Naive Bayes", nb_model),
    ("XGBoost", xgb_model),
    ("LightGBM", lgb_model),
):
    verdict = "Real" if model.predict(vectorized_input)[0] == 1 else "Fake"
    print(model_name + ":", verdict)



🧠 Predictions:
Logistic Regression: Real
Naive Bayes: Real
XBBOOST: Fake
LightGBM: Fake


