In [28]:
import numpy as np
import pandas as pd
import re
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [29]:
# Ensure same preprocessing as previous script
def advanced_text_preprocessing(content):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer
    
    lemmatizer = WordNetLemmatizer()
    # Convert to lowercase and remove special characters
    lemmatized_content = re.sub(r'http\S+|www\S+|https\S+', '', content, flags=re.MULTILINE)  # Remove URLs
    lemmatized_content = re.sub(r'@\w+|\#', '', lemmatized_content)  # Remove mentions and hashtag symbols
    lemmatized_content = re.sub(r'[^a-zA-Z\s]', '', lemmatized_content)  # Remove numbers and punctuation
    lemmatized_content = lemmatized_content.lower()

    # Tokenization
    words = word_tokenize(lemmatized_content)

    # Remove stopwords and lemmatize
    stop_words = set(stopwords.words('english'))
    custom_stopwords = {'rt', 'via'}
    stop_words.update(custom_stopwords)
    processed_words = [
        lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 2
    ]
    lemmatized_content = ' '.join(processed_words)
    return lemmatized_content

In [30]:
def load_data():
    """Load and preprocess the Twitter dataset"""
    column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
    data = pd.read_csv("twitter_dataset.csv", names=column_names, encoding='ISO-8859-1')
    data['target'] = data['target'].replace(4, 1)
    return data

In [32]:
def create_deep_learning_model(vocab_size, max_length):
    """Create a deep learning model with LSTM layers"""
    model = Sequential([
        # Embedding layer to convert words to dense vector representations
        Embedding(vocab_size, 100, input_length=max_length),
        
        # LSTM layer with dropout for regularization
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        
        # Additional dense layers with dropout
        Dense(64, activation='relu'),
        Dropout(0.5),
        
        # Output layer
        Dense(1, activation='sigmoid')
    ])
    
    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [33]:
def prepare_deep_learning_data(texts, max_words=5000, max_length=100):
    """Prepare text data for deep learning model"""
    # Tokenize the text
    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(texts)
    
    # Convert text to sequences
    sequences = tokenizer.texts_to_sequences(texts)
    
    # Pad sequences to ensure uniform length
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    
    return padded_sequences, tokenizer

In [34]:
def train_deep_learning_model(X_train, y_train, X_test, y_test):
    """Train and evaluate deep learning model"""
    # Prepare data
    max_words = 5000
    max_length = 100
    
    # Prepare sequences
    X_train_seq, tokenizer = prepare_deep_learning_data(X_train, max_words, max_length)
    X_test_seq, _ = prepare_deep_learning_data(X_test, max_words, max_length)
    
    # Ensure binary targets
    y_train = y_train.astype(float)
    y_test = y_test.astype(float)
    
    # Create and compile model
    vocab_size = len(tokenizer.word_index) + 1
    model = create_deep_learning_model(vocab_size, max_length)
    
    # Train the model
    history = model.fit(
        X_train_seq, y_train, 
        epochs=10, 
        batch_size=32, 
        validation_split=0.2,
        verbose=1
    )
    
    # Evaluate the model
    train_pred = (model.predict(X_train_seq) > 0.5).astype(int).flatten()
    test_pred = (model.predict(X_test_seq) > 0.5).astype(int).flatten()
    
    results = {
        'Deep Learning Model': {
            'train_accuracy': accuracy_score(y_train, train_pred),
            'test_accuracy': accuracy_score(y_test, test_pred),
            'classification_report': classification_report(y_test, test_pred)
        }
    }
    
    return results, model, tokenizer

In [35]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [36]:
# Load and preprocess data
    
print("Loading and preprocessing data...")
data = load_data()

Loading and preprocessing data...


In [37]:
# Preprocess text
# apply on 30k data
data = data.sample(n=100000, random_state=42)

data['lemmatized_content'] = data['text'].apply(advanced_text_preprocessing)

In [24]:
# Split data
X = data['lemmatized_content'].values
y = data['target'].values
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, stratify=y, random_state=42
)

In [38]:
# Train deep learning model
print("Training deep learning model...")

dl_results, dl_model, tokenizer = train_deep_learning_model(X_train, y_train, X_test, y_test)

Training deep learning model...
Epoch 1/10




[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 41ms/step - accuracy: 0.5086 - loss: 0.6938 - val_accuracy: 0.5009 - val_loss: 0.6931
Epoch 2/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 39ms/step - accuracy: 0.5027 - loss: 0.6934 - val_accuracy: 0.4991 - val_loss: 0.6932
Epoch 3/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 42ms/step - accuracy: 0.4957 - loss: 0.6933 - val_accuracy: 0.5009 - val_loss: 0.6932
Epoch 4/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 44ms/step - accuracy: 0.4956 - loss: 0.6932 - val_accuracy: 0.5009 - val_loss: 0.6932
Epoch 5/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 48ms/step - accuracy: 0.5005 - loss: 0.6932 - val_accuracy: 0.5009 - val_loss: 0.6931
Epoch 6/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 42ms/step - accuracy: 0.4993 - loss: 0.6934 - val_accuracy: 0.5009 - val_loss: 0.6932
Epoch 7/10
[1m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [39]:
for model_name, metrics in dl_results.items():
        print(f"\n{model_name} Results:")
        print(f"Training Accuracy: {metrics['train_accuracy']:.4f}")
        print(f"Test Accuracy: {metrics['test_accuracy']:.4f}")
        print("\nClassification Report:")
        print(metrics['classification_report'])


Deep Learning Model Results:
Training Accuracy: 0.5003
Test Accuracy: 0.5003

Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      4997
         1.0       0.50      1.00      0.67      5003

    accuracy                           0.50     10000
   macro avg       0.25      0.50      0.33     10000
weighted avg       0.25      0.50      0.33     10000



# Deep Learning Sentiment Analysis Model Evaluation Report

## 1. Model Overview
**Model Type**: LSTM-based Deep Learning Model
**Dataset**: Twitter Sentiment Dataset (100,000 samples)
**Task**: Binary Sentiment Classification

## 2. Model Performance Metrics

### Accuracy
| Metric | Value |
|--------|-------|
| Training Accuracy | 0.5003 |
| Test Accuracy | 0.5003 |

### Classification Report
| Class | Precision | Recall | F1-Score | Support |
|-------|-----------|--------|----------|---------|
| Negative (0) | 0.00 | 0.00 | 0.00 | 4,997 |
| Positive (1) | 0.50 | 1.00 | 0.67 | 5,003 |
| **Macro Average** | **0.25** | **0.50** | **0.33** | **10,000** |

## 3. Model Architecture
- **Embedding Layer**: 100-dimensional word embeddings
- **LSTM Layer**: 128 units with 0.2 dropout
- **Dense Layer**: 64 neurons with ReLU activation
- **Output Layer**: Sigmoid activation for binary classification

## 4. Training Configuration
- **Optimizer**: Adam (learning rate: 0.001)
- **Loss Function**: Binary Cross-Entropy
- **Batch Size**: 32
- **Epochs**: 10
- **Validation Split**: 0.2

## 5. Key Observations
- Model exhibits poor performance, close to random guessing
- Significant class imbalance in predictions
- No effective learning occurred during training
- Consistent loss and accuracy across epochs suggest model failure

## 6. Potential Improvement Strategies
1. Increase model complexity
2. Use pre-trained word embeddings
3. Apply advanced regularization techniques
4. Implement data augmentation
5. Explore alternative architectures
6. Fine-tune hyperparameters

## 7. Preprocessing Techniques
- URL removal
- Special character elimination
- Lowercase conversion
- Stopwords removal
- Lemmatization

## 8. Conclusion
The current deep learning model failed to learn meaningful sentiment representations, indicating a need for significant architectural and training modifications.