# In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import joblib

# Fetch the NLTK data packages requested by this script: tokenizer models,
# the stop-word lists and the WordNet corpus.
nltk_packages = ['punkt', 'punkt_tab', 'stopwords', 'wordnet']
nltk.download(nltk_packages)

# Load data with error handling
def load_data(fake_path='fake.csv', real_path='true.csv', random_state=None):
    """Load the fake and real news CSVs into one shuffled, labelled DataFrame.

    Args:
        fake_path: CSV file of fake articles; its rows get ``label`` 1.
        real_path: CSV file of real articles; its rows get ``label`` 0.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (the default) leaves the
            shuffle unseeded.

    Returns:
        A DataFrame holding the rows of both files in random order, with a
        fresh 0..n-1 index and an added ``label`` column.

    Raises:
        FileNotFoundError: If either CSV is missing. A hint naming both
            expected files is printed before the exception is re-raised.
    """
    try:
        fake = pd.read_csv(fake_path)
        real = pd.read_csv(real_path)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        # Name the files that are actually read (the old hint said 'real.csv'
        # while the code loads 'true.csv').
        print(f"Please ensure '{fake_path}' and '{real_path}' exist in your working directory")
        raise

    fake['label'] = 1
    real['label'] = 0

    combined = pd.concat([fake, real])
    # frac=1 samples every row, i.e. a full shuffle; the old index is dropped
    # so it does not leak the original file order.
    return combined.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Text preprocessing
# Compiled once at import time; preprocess_text runs once per document.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_NON_WORD_PATTERN = re.compile(r'[^\w\s]')

# Lazily filled cache so the lemmatizer and stop-word set are built a single
# time instead of on every call.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared (lemmatizer, stop-word set), creating them on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps, in order: coerce to ``str`` and lowercase, strip URLs, strip every
    character that is neither a word character nor whitespace, tokenize with
    ``nltk.word_tokenize``, drop English stop words and tokens of two
    characters or fewer, then lemmatize what remains.

    Args:
        text: The raw document. Any value is accepted because it is passed
            through ``str()`` first.

    Returns:
        The cleaned tokens joined by single spaces (an empty string when no
        token survives the filters).
    """
    lemmatizer, stop_words = _get_text_resources()

    text = str(text).lower()
    text = _URL_PATTERN.sub('', text)
    text = _NON_WORD_PATTERN.sub('', text)

    tokens = nltk.word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 2
    )

# Main execution
# Entry-point guard. Python's dunder names carry two underscores on each side
# (`__name__`, "__main__"); the single-underscore spelling raised NameError.
if __name__ == "__main__":
    # Build the labelled corpus, then derive the cleaned text column that the
    # train/test split reads.
    df = load_data()
    df['clean_text'] = df['text'].apply(preprocess_text)
    



# Hold out 20% of the cleaned documents for evaluation (fixed seed).
split_parts = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# TF-IDF features: the vocabulary (capped at 5000 terms) is fitted on the
# training documents only; the test documents are merely transformed.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Persist the fitted vectorizer to disk.
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

# Multinomial Naive Bayes on the TF-IDF features: fit, score on the held-out
# split, then persist the fitted model.
nb = MultinomialNB().fit(X_train_tfidf, y_train)  # fit() returns the estimator
nb_preds = nb.predict(X_test_tfidf)

nb_accuracy = accuracy_score(y_test, nb_preds)
nb_report = classification_report(y_test, nb_preds)
print("Naive Bayes Accuracy:", nb_accuracy)
print(nb_report)

joblib.dump(nb, 'nb_model.pkl')

# Random Forest (100 trees) on the same TF-IDF features: fit, score on the
# held-out split, then persist the fitted model.
rf = RandomForestClassifier(n_estimators=100).fit(X_train_tfidf, y_train)  # fit() returns the estimator
rf_preds = rf.predict(X_test_tfidf)

rf_accuracy = accuracy_score(y_test, rf_preds)
rf_report = classification_report(y_test, rf_preds)
print("\nRandom Forest Accuracy:", rf_accuracy)
print(rf_report)

joblib.dump(rf, 'rf_model.pkl')

# Shared sizes for the LSTM pipeline: the tokenizer's word limit doubles as the
# embedding's input dimension, and every sequence is padded/truncated to one length.
vocab_size = 5000
max_seq_len = 200

# Word index is learned from the training documents only.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)
X_train_seq = pad_sequences(train_sequences, maxlen=max_seq_len)
X_test_seq = pad_sequences(test_sequences, maxlen=max_seq_len)

# Embedding -> LSTM -> single sigmoid unit for the binary label.
lstm_model = Sequential()
lstm_model.add(Embedding(vocab_size, 128, input_length=max_seq_len))
lstm_model.add(LSTM(128))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# The held-out split is passed as validation data for the 3 training epochs.
lstm_model.fit(
    X_train_seq,
    y_train,
    epochs=3,
    validation_data=(X_test_seq, y_test),
)
lstm_model.save('lstm_model.h5')

# Persist the fitted tokenizer alongside the saved model.
joblib.dump(tokenizer, 'lstm_tokenizer.pkl')