In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc
import seaborn as sns

In [6]:
# Class to handle data preprocessing
class DataPreprocessing:
    def __init__(self, max_words=10000, max_len=100):
        self.max_words = max_words
        self.max_len = max_len
        self.tokenizer = Tokenizer(num_words=self.max_words)
    
    def load_data(self, filepath):
        # Load the Sentiment140 dataset (you can replace with your own dataset path)
        df = pd.read_csv(filepath, encoding='latin-1', header=None)
        df.columns = ['target', 'id', 'date', 'query', 'user', 'text']
        df = df[['target', 'text']]
        df['target'] = df['target'].apply(lambda x: 1 if x == 4 else 0)  # 4 -> positive, 0 -> negative
        return df
    
    def preprocess_text(self, df):
        # Clean text (optional: implement more cleaning steps if needed)
        df['text'] = df['text'].apply(lambda x: x.lower())
        return df
    
    def tokenize_text(self, df):
        # Tokenize the text data
        self.tokenizer.fit_on_texts(df['text'])
        sequences = self.tokenizer.texts_to_sequences(df['text'])
        padded_sequences = pad_sequences(sequences, maxlen=self.max_len)
        return padded_sequences
    
    def encode_labels(self, df):
        # Encode labels (0 or 1)
        label_encoder = LabelEncoder()
        labels = label_encoder.fit_transform(df['target'])
        return labels


In [7]:
# Class to handle the deep learning model
class SentimentAnalysisModel:
    def __init__(self, max_words=10000, max_len=100, embedding_dim=100, dropout_rate=0.2):
        self.max_words = max_words
        self.max_len = max_len
        self.embedding_dim = embedding_dim
        self.dropout_rate = dropout_rate
        self.model = self.build_model()
    
    def build_model(self):
        model = Sequential()
        model.add(Embedding(input_dim=self.max_words, output_dim=self.embedding_dim, input_length=self.max_len))
        model.add(LSTM(128, dropout=self.dropout_rate, recurrent_dropout=self.dropout_rate))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model
    
    def train(self, X_train, y_train, X_val, y_val, batch_size=64, epochs=5):
        early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
        history = self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, 
                                 validation_data=(X_val, y_val), callbacks=[early_stopping])
        return history
    
    def evaluate(self, X_test, y_test):
        loss, accuracy = self.model.evaluate(X_test, y_test)
        return loss, accuracy
    
    def predict(self, X_input):
        return self.model.predict(X_input)

In [8]:
# Main workflow
if __name__ == "__main__":
    # Initialize Data Preprocessing and Model
    data_processor = DataPreprocessing()
    sentiment_model = SentimentAnalysisModel()

    # Load and preprocess data
    df = data_processor.load_data('sentiment400/training.1600000.processed.noemoticon.csv')
    df = data_processor.preprocess_text(df)
    
    # Prepare features and labels
    X = data_processor.tokenize_text(df)
    y = data_processor.encode_labels(df)

    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model
    sentiment_model.train(X_train, y_train, X_test, y_test)

    # Evaluate the model
    loss, accuracy = sentiment_model.evaluate(X_test, y_test)
    print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

    # Example prediction
    sample_text = ["I love this product!", "This is the worst experience ever."]
    sample_seq = data_processor.tokenize_text(pd.DataFrame(sample_text, columns=['text']))
    predictions = sentiment_model.predict(sample_seq)
    print(f"Predictions: {predictions}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.37784940004348755, Test Accuracy: 0.8313875198364258
Predictions: [[0.9494    ]
 [0.92765313]]


In [9]:
y_pred_prob = sentiment_model.predict(X_test)  # Predict probabilities for X_test
y_pred = (y_pred_prob > 0.5).astype(int) 



In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [13]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [14]:
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Accuracy: 0.8314
Precision: 0.8288
Recall: 0.8367
F1-Score: 0.8327


In [15]:
# Classification report for detailed metrics
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Negative", "Positive"]))


Classification Report:
               precision    recall  f1-score   support

    Negative       0.83      0.83      0.83    159494
    Positive       0.83      0.84      0.83    160506

    accuracy                           0.83    320000
   macro avg       0.83      0.83      0.83    320000
weighted avg       0.83      0.83      0.83    320000



In [16]:
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)


Confusion Matrix:
 [[131745  27749]
 [ 26207 134299]]
