# LSTM

## Import the required libraries

In [23]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D

## Construct paths using os.path.join

In [2]:
true_news_path = os.path.join('..', 'artifacts', 'preprocessed_true.csv')
fake_news_path = os.path.join('..', 'artifacts', 'preprocessed_fake.csv')

## Read the CSV files

In [3]:
true_news = pd.read_csv(true_news_path)
fake_news = pd.read_csv(fake_news_path)

## Add a label column to each dataframe

In [4]:
true_news['label'] = 1
fake_news['label'] = 0

## Combine the datasets

In [5]:
data = pd.concat([true_news, fake_news], ignore_index=True)

## Shuffle the data

In [6]:
data = data.sample(frac=1).reset_index(drop=True)

## Check for missing values in the 'text' column

In [7]:
data['text'].isnull().sum()

632

## Fill missing values with an empty string

In [8]:
data['text'].fillna('', inplace=True)

## Prepare text and labels

In [9]:
texts = data['text'].values
labels = data['label'].values

## Tokenize the text

In [10]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

## Pad sequences to ensure uniform input size

In [11]:
data_padded = pad_sequences(sequences, maxlen=500)

## Encode the labels

In [12]:
encoder = LabelEncoder()
labels_encoded = encoder.fit_transform(labels)

## Split the data into training (70%) and remaining (30%)

In [13]:
x_train, x_rem, y_train, y_rem = train_test_split(data_padded, labels_encoded, train_size=0.7, random_state=42)

## Split the remaining data into testing (20%) and validation (10%)

In [14]:
x_test, x_val, y_test, y_val = train_test_split(x_rem, y_rem, test_size=0.33, random_state=42)

## Define the model

In [15]:
model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=128, input_length=500),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

## Compile the model

In [16]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## Print the model summary

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 128)          25906304  
                                                                 
 spatial_dropout1d (Spatial  (None, 500, 128)          0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 100)               91600     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 25998005 (99.17 MB)
Trainable params: 25998005 (99.17 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Train the model

In [18]:
history = model.fit(x_train, y_train, epochs=5, batch_size=64, validation_data=(x_val, y_val), verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Evaluate the model

In [19]:
loss, accuracy = model.evaluate(x_test, y_test, verbose=1)
print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.9846


## Save the model

In [24]:
model.save('news_classification_model.keras')

## Load the model

In [25]:
from tensorflow.keras.models import load_model
model = load_model('news_classification_model.h5')

## Sample prediction

In [26]:
sample_text = ["Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that."]
sample_sequence = tokenizer.texts_to_sequences(sample_text)
sample_padded = pad_sequences(sample_sequence, maxlen=500)
prediction = model.predict(sample_padded)
print('Fake News' if prediction < 0.5 else 'True News')

Fake News
