In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import pickle

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import save_model
from sklearn.model_selection import train_test_split

# Download required NLTK resources.
# 'punkt_tab' is what word_tokenize looks up on recent NLTK releases (the
# pickled 'punkt' models were replaced there); 'punkt' is still fetched so the
# notebook keeps working on older NLTK versions.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)

# --- Step 1: Load Datasets ---
def load_liar_dataset(data_dir='liar_dataset'):
    """Load the LIAR train/test/valid splits into one DataFrame.

    Args:
        data_dir: Directory holding ``train.tsv``, ``test.tsv`` and
            ``valid.tsv``. Defaults to ``'liar_dataset'`` so existing
            zero-argument calls behave as before.

    Returns:
        A DataFrame with the 14 named LIAR columns, containing the rows of
        the train, test and valid files (in that order) under a fresh
        0..n-1 index.

    Raises:
        FileNotFoundError: If any of the three split files is missing.
    """
    # Local import keeps this function usable without touching the file's
    # top-level import block.
    from pathlib import Path

    # The TSV files carry no header row, so the column names are supplied here.
    column_names = [
        'id', 'label', 'statement', 'subject', 'speaker', 'speaker_job',
        'state_info', 'party', 'barely_true_counts', 'false_counts',
        'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'
    ]
    base_dir = Path(data_dir)
    split_frames = [
        pd.read_csv(base_dir / f'{split}.tsv', sep='\t', header=None, names=column_names)
        for split in ('train', 'test', 'valid')
    ]
    return pd.concat(split_frames, ignore_index=True)

def load_additional_news(fake_path='Fake.csv', true_path='True.csv'):
    """Load the fake/true news CSVs as one ``statement``/``label`` DataFrame.

    Args:
        fake_path: CSV whose rows are labelled ``'false'``. Defaults to
            ``'Fake.csv'``.
        true_path: CSV whose rows are labelled ``'true'``. Defaults to
            ``'True.csv'``.

    Returns:
        A DataFrame with exactly the columns ``statement`` (copied from each
        file's ``text`` column) and ``label``; fake rows come first, then true
        rows, under a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If either CSV is missing.
        KeyError: If a CSV has no ``text`` column.
    """
    labelled_frames = []
    for csv_path, label in ((fake_path, 'false'), (true_path, 'true')):
        raw = pd.read_csv(csv_path)
        # Keep only the article body, renamed to match the LIAR schema.
        labelled = raw[['text']].rename(columns={'text': 'statement'})
        labelled_frames.append(labelled.assign(label=label))
    return pd.concat(labelled_frames, ignore_index=True)

def load_combined_dataset():
    """Return LIAR plus the additional news data as one DataFrame.

    The LIAR frame is reduced to its ``statement`` and ``label`` columns and
    the additional-news frame is appended after it; the result gets a fresh
    0..n-1 index.
    """
    shared_columns = ['statement', 'label']
    frames = [
        load_liar_dataset()[shared_columns],
        load_additional_news(),
    ]
    return pd.concat(frames, ignore_index=True)

def convert_labels_to_binary(df):
    """Keep rows with a known label and add a 0/1 ``binary_label`` column.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A new DataFrame (the input is not modified) holding only the rows
        whose label is one of the six known values, with ``binary_label`` set
        to 0 for 'false' / 'pants-fire' / 'barely-true' and 1 for
        'true' / 'mostly-true' / 'half-true'. The original index is kept.
    """
    fake_labels = ['false', 'pants-fire', 'barely-true']
    real_labels = ['true', 'mostly-true', 'half-true']

    known = df['label'].isin(fake_labels + real_labels)
    # Explicit copy: adding a column to a filtered slice triggers
    # SettingWithCopyWarning on pandas < 3.
    result = df.loc[known].copy()
    # Every remaining label is either fake or real, so "is real" is the flag.
    result['binary_label'] = result['label'].isin(real_labels).astype('int64')
    return result

# --- Step 2: Preprocess Text ---
# Patterns applied, in this order, to the lower-cased text.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_LETTER_RE = re.compile(r'[^a-z\s]')

# Filled on first use so the stop-word list and stemmer are built once instead
# of once per row.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_text(text):
    """Normalise one statement into a space-joined string of stemmed tokens.

    Steps: lower-case, strip URLs, strip HTML tags, drop every character that
    is not a-z or whitespace, tokenize, remove English stop words, and apply
    the Porter stemmer.

    Args:
        text: The raw statement. Missing values (``None``, NaN) give ``""``;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned text, or ``""`` when nothing survives.
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        # e.g. a numeric cell read from CSV; avoids AttributeError on .lower()
        text = str(text)

    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _NON_LETTER_RE.sub('', text)

    stop_words, stemmer = _get_preprocess_resources()
    return " ".join(
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )

# --- Step 3: Load & Preprocess Data ---
print("Loading and preprocessing data...")
df = load_combined_dataset()
df = convert_labels_to_binary(df)
# assign() returns a new frame, so the column is never written onto a
# filtered slice (which raises SettingWithCopyWarning on pandas < 3).
df = df.assign(preprocessed_text=df['statement'].apply(preprocess_text))

# Drop rows whose text is empty after preprocessing; copy() leaves df as an
# independent frame rather than a slice of the unfiltered one.
df = df[df['preprocessed_text'].str.strip() != ''].copy()

# --- Step 4: Tokenization ---
MAX_FEATURES = 10000  # vocabulary cap given to the Tokenizer
MAXLEN = 200          # length every sequence is padded/truncated to

corpus = df['preprocessed_text']

# Learn the vocabulary from the cleaned corpus, then encode that same corpus.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

padded = pad_sequences(sequences, maxlen=MAXLEN)
labels = df['binary_label'].values

# --- Step 5: Train/Test Split ---
# Hold out 20% of the samples; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    padded,
    labels,
    test_size=0.2,
    random_state=42,
)

# --- Step 6: Define LSTM Model ---
# Embedding -> dropout over whole embedding channels -> single LSTM layer ->
# one sigmoid unit for the binary (fake vs. real) decision.
# `input_length` is deliberately not passed to Embedding: it is deprecated in
# Keras 3 and the LSTM does not need a fixed sequence length declared up front.
model = Sequential([
    Embedding(MAX_FEATURES, 128),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# --- Step 7: Train ---
print("Training model...")
# 10% of the training split is carved off by Keras for per-epoch validation.
model.fit(X_train, y_train, epochs=3, batch_size=32, validation_split=0.1, verbose=1)

# Score the held-out split created in Step 5; without this the test set is
# never used and no out-of-sample metric is reported.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {test_loss:.4f} | test accuracy: {test_accuracy:.4f}")

# --- Step 8: Save Model & Tokenizer ---
# Both artifacts are needed at inference time: the tokenizer carries the
# fitted vocabulary the model was trained with.
MODEL_PATH = "lstm_model.h5"
TOKENIZER_PATH = "tokenizer.pkl"

print("Saving model and tokenizer...")
model.save(MODEL_PATH)

with open(TOKENIZER_PATH, "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

print("✅ Model and tokenizer saved!")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Loading and preprocessing data...


FileNotFoundError: [Errno 2] No such file or directory: 'liar_dataset/train.tsv'

In [3]:
from google.colab import files

# Trigger a browser download for each saved artifact, model first.
for artifact_name in ("lstm_model.h5", "tokenizer.pkl"):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>