In [22]:
! pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0



[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout,Bidirectional
from tensorflow.keras.optimizers import Adam

In [12]:
df = pd.read_csv("News_Sentiment_dataset.csv")

In [4]:
df.shape

(848, 5)

In [5]:
df.head()

Unnamed: 0,news_title,reddit_title,sentiment,text,url
0,Mark Cuban launches generic drug company,Billionaire Mark Cuban just launched a drug co...,1.0,Billionaire investor and Shark Tank star Mark ...,https://www.beckershospitalreview.com/pharmacy...
1,From Defendant to Defender: One Wrongfully Con...,"Man falsely imprisoned for 10 years, uses pris...",1.0,Attorney Jarrett Adams recently helped overtur...,https://www.nbcnews.com/news/us-news/defendant...
2,"Amazon Tribe Wins Lawsuit Against Big Oil, Sav...",Amazon tribe wins legal battle against oil com...,1.0,The Amazon Rainforest is well known across the...,https://www.disclose.tv/amazon-tribe-wins-laws...
3,Newark police: No officer fired a single shot ...,Newark police: No officer fired a single shot ...,1.0,Newark police: No officer fired a single shot ...,https://newjersey.news12.com/newark-police-no-...
4,Ingen barn døde i trafikken i 2019,No children died in traffic accidents in Norwa...,1.0,I 1970 døde det 560 mennesker i den norske tra...,https://www.nrk.no/trondelag/ingen-barn-dode-i...


In [7]:
df.isnull().sum()

news_title      0
reddit_title    0
sentiment       0
text            0
url             0
dtype: int64

In [10]:
df['sentiment'].value_counts()

sentiment
1.0    748
0.0    100
Name: count, dtype: int64

In [13]:
# Data Preprocessing
texts = df['text'].values  # News articles
labels = df['sentiment'].values  # Sentiment labels (0 or 1)

In [14]:
# Tokenize the text data
max_vocab_size = 10000  # The number of unique words to keep in the tokenizer
max_sequence_length = 300  # Maximum sequence length for padding
embedding_dim = 100  # Word embedding dimension

In [15]:
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)


In [16]:
# Pad sequences to ensure consistent input size
X = pad_sequences(sequences, maxlen=max_sequence_length)

In [17]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [18]:
# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=max_vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(units=128, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Sigmoid activation for binary classification




In [19]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Add EarlyStopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), callbacks=[early_stopping])


Epoch 1/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 544ms/step - accuracy: 0.8244 - loss: 0.6525 - val_accuracy: 0.8471 - val_loss: 0.5073
Epoch 2/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 488ms/step - accuracy: 0.8937 - loss: 0.3419 - val_accuracy: 0.8471 - val_loss: 0.4358
Epoch 3/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 506ms/step - accuracy: 0.8886 - loss: 0.3261 - val_accuracy: 0.8471 - val_loss: 0.4405
Epoch 4/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 505ms/step - accuracy: 0.9014 - loss: 0.2624 - val_accuracy: 0.8471 - val_loss: 0.4923
Epoch 5/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 471ms/step - accuracy: 0.9081 - loss: 0.1890 - val_accuracy: 0.8471 - val_loss: 0.5943


In [20]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy * 100:.2f}%")

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step - accuracy: 0.8638 - loss: 0.4020
Test accuracy: 84.71%


In [28]:
# Define the Bidirectional LSTM model
model = Sequential()
model.add(Embedding(input_dim=max_vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(units=128)))  # Bidirectional LSTM
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Sigmoid activation for binary classification




In [29]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Add EarlyStopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy * 100:.2f}%")


Epoch 1/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 877ms/step - accuracy: 0.7685 - loss: 0.6233 - val_accuracy: 0.8471 - val_loss: 0.6701
Epoch 2/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 793ms/step - accuracy: 0.8876 - loss: 0.4089 - val_accuracy: 0.8471 - val_loss: 0.4311
Epoch 3/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 757ms/step - accuracy: 0.8954 - loss: 0.3277 - val_accuracy: 0.8471 - val_loss: 0.4484
Epoch 4/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 785ms/step - accuracy: 0.8801 - loss: 0.3286 - val_accuracy: 0.8471 - val_loss: 0.4301
Epoch 5/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 786ms/step - accuracy: 0.8819 - loss: 0.2822 - val_accuracy: 0.8471 - val_loss: 0.4602
Epoch 6/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 801ms/step - accuracy: 0.9021 - loss: 0.1879 - val_accuracy: 0.8412 - val_loss: 0.4786
Epoch 7/10
[1m11/11[0m [

In [41]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load dataset
df = pd.read_csv("News_Sentiment_dataset.csv")

# Data Preprocessing
texts = df['text'].values  # News articles
labels = df['sentiment'].values  # Sentiment labels (0 or 1)

# Tokenize the text data using a simple tokenizer (this will not use BERT's tokenizer)
tokenizer = Tokenizer(num_words=10000)  # Limit vocab size to 10,000 words
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad sequences to ensure uniform input size
max_sequence_length = 256  # You can adjust this value as needed
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Convert labels into TensorFlow tensors
train_labels = tf.convert_to_tensor(y_train)
test_labels = tf.convert_to_tensor(y_test)

# Define the deep LSTM model
def create_deep_lstm_model():
    model = Sequential()
    
    # Embedding layer
    model.add(Embedding(input_dim=10000, output_dim=128, input_length=max_sequence_length))
    
    # First LSTM layer with dropout
    model.add(LSTM(128, return_sequences=True))  # Return sequences for the next LSTM layer
    model.add(Dropout(0.2))
    
    # Second LSTM layer
    model.add(LSTM(64))  # No return_sequences, because it's the last LSTM layer
    model.add(Dropout(0.2))
    
    # Dense layer for binary classification (sigmoid activation for binary labels)
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile the model
    model.compile(optimizer=Adam(learning_rate=2e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

# Create the model
model = create_deep_lstm_model()

# Train the model
model.fit(X_train, train_labels, epochs=5, batch_size=32, validation_data=(X_test, test_labels))

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, test_labels)
print(f"Test accuracy: {accuracy * 100:.2f}%")

# Make predictions (optional)
predictions = model.predict(X_test)
predicted_labels = (predictions > 0.5).astype(int)  # Convert probabilities to binary labels

# Compute accuracy score
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Final Accuracy score: {accuracy * 100:.2f}%")

# Save the trained model
model.save('deep_lstm_model.h5')




Epoch 1/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 217ms/step - accuracy: 0.6599 - loss: 0.6907 - val_accuracy: 0.8471 - val_loss: 0.6864
Epoch 2/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 186ms/step - accuracy: 0.8905 - loss: 0.6828 - val_accuracy: 0.8471 - val_loss: 0.6792
Epoch 3/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 202ms/step - accuracy: 0.9024 - loss: 0.6735 - val_accuracy: 0.8471 - val_loss: 0.6695
Epoch 4/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 186ms/step - accuracy: 0.8845 - loss: 0.6619 - val_accuracy: 0.8471 - val_loss: 0.6548
Epoch 5/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 196ms/step - accuracy: 0.8959 - loss: 0.6414 - val_accuracy: 0.8471 - val_loss: 0.6301
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.8638 - loss: 0.6267
Test accuracy: 84.71%
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 114ms



Final Accuracy score: 84.71%


In [43]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# Load the saved deep LSTM model
model = load_model('deep_lstm_model.h5')

# Tokenizer used during training (you need to fit it on the training data during the training phase)
tokenizer = Tokenizer(num_words=10000)  # Same num_words parameter used during training

# Function to predict sentiment of a new news article
def predict_sentiment(news_text: str):
    # Tokenize the new input text
    sequences = tokenizer.texts_to_sequences([news_text])

    # Pad the sequences to the same length used during training
    max_sequence_length = 256  # Adjust this if necessary (should be the same as during training)
    padded_sequence = pad_sequences(sequences, maxlen=max_sequence_length)

    # Make prediction
    prediction = model.predict(padded_sequence)

    # Return the result (0 or 1)
    if prediction > 1:
        return "Positive"
    else:
        return "Negative"

# Example usage of the prediction function
new_article = "One killed after Tesla Cybertruck catches fire and explodes outside Trump’s Las Vegas hotel - PBS NewsHour"

sentiment = predict_sentiment(new_article)
print(f"The sentiment of the article is: {sentiment}")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 856ms/step
The sentiment of the article is: Negative
