In [3]:
import pandas as pd

# Directory holding Fake.csv and True.csv. It is defined once so the notebook
# can be pointed at another location by changing a single value, or by setting
# the FAKENEWS_DATA_DIR environment variable before starting the kernel.
# The default is the original author's local path.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get("FAKENEWS_DATA_DIR", r"E:\code\project\fakenews"))

# Load both datasets and tag every row with its class (1 = Fake, 0 = Real)
df_fake = pd.read_csv(DATA_DIR / "Fake.csv").assign(label=1)
df_true = pd.read_csv(DATA_DIR / "True.csv").assign(label=0)

# Combine datasets (fake rows first, then real rows)
df = pd.concat([df_fake, df_true], axis=0)

# Shuffle dataset so the two classes are interleaved; the fixed random_state
# makes the row order reproducible, and the index is renumbered from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Display dataset structure
print(df.head())


                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      1  
1       April 5, 2017       0  
2  September 27, 2017       0  
3         May 22, 2017      1  
4       June 24, 2016       0  


In [4]:
import re
from sklearn.model_selection import train_test_split

def preprocess_text(text):
    """Normalise a piece of text for vectorisation.

    Every character that is not an ASCII letter or digit is replaced by a
    single space (one space per removed character; runs are not collapsed),
    and the result is lower-cased.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str`` first.

    Returns:
        The cleaned, lower-cased string (``""`` for missing input).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        is_nan = isinstance(text, float) and text != text
        if text is None or is_nan:
            return ""
        text = str(text)

    # Remove special characters, then convert to lowercase
    return re.sub(r'[^a-zA-Z0-9]', ' ', text).lower()

# Apply preprocessing to "title + text".
# Missing values are replaced by empty strings before concatenation: a NaN in
# either column would otherwise make the whole concatenated value NaN, which
# discards the other column's text and is not a string for preprocess_text.
df["clean_text"] = (
    df["title"].fillna("") + " " + df["text"].fillna("")
).apply(preprocess_text)

# Split dataset (80% train, 20% test); random_state fixes the partition
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Turn the cleaned text into TF-IDF feature matrices. The vocabulary is
# learned from the training split only and then re-used for the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit a multinomial Naïve Bayes classifier on the training vectors
# (fit() returns the estimator itself, so construction and fitting are chained).
nb_model = MultinomialNB().fit(X_train_vec, y_train)

# Score the classifier on the held-out split and report the accuracy
y_pred = nb_model.predict(X_test_vec)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Naïve Bayes Model Accuracy: {accuracy:.2f}")


Naïve Bayes Model Accuracy: 0.93


In [6]:
import joblib

# Persist the fitted Naïve Bayes classifier to the current working directory.
joblib.dump(nb_model, "naive_bayes_model.pkl")
# Persist the fitted TF-IDF vectorizer alongside it. This is the cell's last
# expression, so the list of written file names it returns is displayed.
joblib.dump(vectorizer, "vectorizer.pkl")


['vectorizer.pkl']

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the word index on the training split only (vocabulary capped via num_words=5000)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)


def _to_padded_sequences(texts):
    """Encode texts with the fitted tokenizer and pad them with maxlen=300."""
    encoded = tokenizer.texts_to_sequences(texts)
    return pad_sequences(encoded, maxlen=300)


# Convert both splits into fixed-length integer sequences
X_train_seq = _to_padded_sequences(X_train)
X_test_seq = _to_padded_sequences(X_test)


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Create CNN model: embedding -> 1D convolution -> global max pooling -> dense head.
# `input_length` is deliberately not passed to Embedding: the argument is
# deprecated in Keras 3 and only triggers a warning. The sequence length is
# taken from the padded input when the model is first fitted.
cnn_model = Sequential([
    # input_dim matches the tokenizer's num_words (5000) so every index is covered
    Embedding(input_dim=5000, output_dim=50),
    Conv1D(filters=128, kernel_size=5, activation="relu"),
    GlobalMaxPooling1D(),
    Dense(10, activation="relu"),
    Dense(1, activation="sigmoid")  # Output: 0 (Real) or 1 (Fake)
])

cnn_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train CNN model.
# NOTE(review): the test split doubles as validation data here, so val_accuracy
# is only an unbiased estimate as long as it is not used to tune the model.
cnn_model.fit(
    X_train_seq,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_test_seq, y_test),
)

# Save CNN Model together with the tokenizer needed to encode new text.
# NOTE(review): the .h5 file name is kept unchanged because code outside this
# notebook may load the model by that name; confirm before changing the format.
cnn_model.save("cnn_model.h5")
joblib.dump(tokenizer, "tokenizer.pkl")




Epoch 1/5
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 33ms/step - accuracy: 0.9255 - loss: 0.1825 - val_accuracy: 0.9918 - val_loss: 0.0251
Epoch 2/5
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 35ms/step - accuracy: 0.9975 - loss: 0.0098 - val_accuracy: 0.9931 - val_loss: 0.0213
Epoch 3/5
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 36ms/step - accuracy: 0.9996 - loss: 0.0025 - val_accuracy: 0.9947 - val_loss: 0.0203
Epoch 4/5
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 37ms/step - accuracy: 1.0000 - loss: 3.1483e-04 - val_accuracy: 0.9949 - val_loss: 0.0214
Epoch 5/5
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 34ms/step - accuracy: 0.9998 - loss: 9.1787e-04 - val_accuracy: 0.9942 - val_loss: 0.0263




['tokenizer.pkl']

In [9]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the training texts in X_train and save it to disk.
# NOTE(review): this rebinds `vectorizer` and repeats the fit performed earlier
# in the notebook with the same max_features=5000 on the same X_train; that
# earlier vectorizer was already saved as "vectorizer.pkl". Confirm whether
# both files are needed by downstream code.
vectorizer = TfidfVectorizer(max_features=5000)  # vocabulary capped via max_features
X_train_tfidf = vectorizer.fit_transform(X_train)  # learn the vocabulary from the training split

# Save the fitted vectorizer under a second file name and confirm on stdout
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("TF-IDF vectorizer saved as 'tfidf_vectorizer.pkl'")


TF-IDF vectorizer saved as 'tfidf_vectorizer.pkl'
