In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import class_weight

In [2]:
def advanced_text_preprocessing(content):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer
    
    lemmatizer = WordNetLemmatizer()
    # More comprehensive preprocessing
    lemmatized_content = re.sub(r'http\S+|www\S+|https\S+', '', content, flags=re.MULTILINE)
    lemmatized_content = re.sub(r'@\w+', '', lemmatized_content)  # Remove mentions
    lemmatized_content = re.sub(r'\#', '', lemmatized_content)  # Remove hashtag symbols
    lemmatized_content = re.sub(r'[^\w\s]', '', lemmatized_content)  # Remove punctuation
    lemmatized_content = lemmatized_content.lower()

    # More advanced tokenization
    words = word_tokenize(lemmatized_content)

    # Enhanced stopwords removal and lemmatization
    stop_words = set(stopwords.words('english'))
    custom_stopwords = {'rt', 'via', 'amp', 'ur', 'u'}
    stop_words.update(custom_stopwords)
    
    processed_words = [
        lemmatizer.lemmatize(word) for word in words 
        if word not in stop_words and len(word) > 1
    ]
    lemmatized_content = ' '.join(processed_words)
    return lemmatized_content

In [3]:
def load_and_preprocess_data():
    # column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
    data = pd.read_csv("IMDB_Dataset.csv", encoding='ISO-8859-1')
    # data['target'] = data['target'].replace(4, 1)
    return data

In [4]:
def create_advanced_deep_learning_model(vocab_size, max_length):
    """Create an advanced deep learning model"""
    model = Sequential([
        # Enhanced embedding layer
        Embedding(vocab_size, 128, input_length=max_length, trainable=True),
        
        # Spatial dropout to prevent overfitting
        SpatialDropout1D(0.3),
        
        # Bidirectional LSTM for capturing context in both directions
        Bidirectional(LSTM(
            128, 
            dropout=0.2, 
            recurrent_dropout=0.2, 
            return_sequences=True
        )),
        
        # Another LSTM layer
        Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
        
        # More dense layers with dropout
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(32, activation='relu'),
        Dropout(0.3),
        
        # Output layer
        Dense(1, activation='sigmoid')
    ])
    
    # Advanced compilation with custom learning rate
    model.compile(
        optimizer=Adam(learning_rate=0.0005),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [5]:
def prepare_advanced_data(texts, max_words=10000, max_length=150):
    """Advanced data preparation with more tokens and longer sequences"""
    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(texts)
    
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(
        sequences, 
        maxlen=max_length, 
        padding='post', 
        truncating='post'
    )
    
    return padded_sequences, tokenizer

In [6]:
def train_advanced_model(X_train, y_train, X_test, y_test):
    """Advanced training with multiple optimization techniques"""
    # Prepare sequences
    max_words = 10000
    max_length = 150
    
    X_train_seq, tokenizer = prepare_advanced_data(X_train, max_words, max_length)
    X_test_seq, _ = prepare_advanced_data(X_test, max_words, max_length)
    
    # Calculate class weights to handle imbalanced data
    class_weights = class_weight.compute_class_weight(
        'balanced', 
        classes=np.unique(y_train), 
        y=y_train
    )
    class_weight_dict = dict(enumerate(class_weights))
    
    # Create model
    vocab_size = len(tokenizer.word_index) + 1
    model = create_advanced_deep_learning_model(vocab_size, max_length)
    
    # Advanced training callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss', 
        patience=5, 
        restore_best_weights=True
    )
    
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss', 
        factor=0.2, 
        patience=3, 
        min_lr=0.00001
    )
    
    # Train with advanced techniques
    history = model.fit(
        X_train_seq, y_train, 
        epochs=20, 
        batch_size=64, 
        validation_split=0.2,
        class_weight=class_weight_dict,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )
    
    # Evaluation
    train_pred = (model.predict(X_train_seq) > 0.5).astype(int).flatten()
    test_pred = (model.predict(X_test_seq) > 0.5).astype(int).flatten()
    
    results = {
        'Enhanced Deep Learning Model': {
            'train_accuracy': accuracy_score(y_train, train_pred),
            'test_accuracy': accuracy_score(y_test, test_pred),
            'classification_report': classification_report(y_test, test_pred)
        }
    }
    
    return results, model, tokenizer

In [7]:
# Download NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
 # Load and preprocess data
print("Loading and preprocessing data...")
data = load_and_preprocess_data()

Loading and preprocessing data...


In [11]:
data.head()

Unnamed: 0,review,sentiment,target
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [10]:
data['target'] = data['sentiment'].replace('positive', 1).replace('negative', 0)

  data['target'] = data['sentiment'].replace('positive', 1).replace('negative', 0)


In [12]:
data = data.dropna()

In [13]:
data.isnull().sum()

review       0
sentiment    0
target       0
dtype: int64

In [14]:
# checking the distribution of target col

data['target'].value_counts()

target
1    25000
0    25000
Name: count, dtype: int64

In [15]:
# Preprocess text

# data = data.sample(n=30000, random_state=42)

data['lemmatized_content'] = data['review'].apply(advanced_text_preprocessing)

In [16]:
# Split data
X = data['lemmatized_content'].values
y = data['target'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [17]:
 # Train advanced model
print("Training advanced deep learning model...")

dl_results, dl_model, tokenizer = train_advanced_model(X_train, y_train, X_test, y_test)

Training advanced deep learning model...




Epoch 1/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 421ms/step - accuracy: 0.6380 - loss: 0.6047 - val_accuracy: 0.5677 - val_loss: 0.6415 - learning_rate: 5.0000e-04
Epoch 2/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 420ms/step - accuracy: 0.7318 - loss: 0.5114 - val_accuracy: 0.7559 - val_loss: 0.5181 - learning_rate: 5.0000e-04
Epoch 3/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 431ms/step - accuracy: 0.8551 - loss: 0.3613 - val_accuracy: 0.8650 - val_loss: 0.3372 - learning_rate: 5.0000e-04
Epoch 4/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 433ms/step - accuracy: 0.8950 - loss: 0.2801 - val_accuracy: 0.8775 - val_loss: 0.3085 - learning_rate: 5.0000e-04
Epoch 5/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 435ms/step - accuracy: 0.9130 - loss: 0.2367 - val_accuracy: 0.8802 - val_loss: 0.3185 - learning_rate: 5.0000e-04
Epoch 6/20
[1m500/500[0m [3

Epoch 1/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 421ms/step - accuracy: 0.6380 - loss: 0.6047 - val_accuracy: 0.5677 - val_loss: 0.6415 - learning_rate: 5.0000e-04
Epoch 2/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 420ms/step - accuracy: 0.7318 - loss: 0.5114 - val_accuracy: 0.7559 - val_loss: 0.5181 - learning_rate: 5.0000e-04
Epoch 3/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 431ms/step - accuracy: 0.8551 - loss: 0.3613 - val_accuracy: 0.8650 - val_loss: 0.3372 - learning_rate: 5.0000e-04
Epoch 4/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 433ms/step - accuracy: 0.8950 - loss: 0.2801 - val_accuracy: 0.8775 - val_loss: 0.3085 - learning_rate: 5.0000e-04
Epoch 5/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 435ms/step - accuracy: 0.9130 - loss: 0.2367 - val_accuracy: 0.8802 - val_loss: 0.3185 - learning_rate: 5.0000e-04
Epoch 6/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 427ms/step - accuracy: 0.9326 - loss: 0.1914 - val_accuracy: 0.8773 - val_loss: 0.3111 - learning_rate: 5.0000e-04
Epoch 7/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 429ms/step - accuracy: 0.9410 - loss: 0.1716 - val_accuracy: 0.8773 - val_loss: 0.3538 - learning_rate: 5.0000e-04
Epoch 8/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 430ms/step - accuracy: 0.9605 - loss: 0.1246 - val_accuracy: 0.8796 - val_loss: 0.3788 - learning_rate: 1.0000e-04
Epoch 9/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 429ms/step - accuracy: 0.9640 - loss: 0.1114 - val_accuracy: 0.8792 - val_loss: 0.4102 - learning_rate: 1.0000e-04
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 32ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 34ms/step


In [18]:
  # Print results
    
for model_name, metrics in dl_results.items():
    print(f"\n{model_name} Results:")
    print(f"Training Accuracy: {metrics['train_accuracy']:.4f}")
    print(f"Test Accuracy: {metrics['test_accuracy']:.4f}")
    print("\nClassification Report:")
    print(metrics['classification_report'])


Enhanced Deep Learning Model Results:
Training Accuracy: 0.9209
Test Accuracy: 0.5177

Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.58      0.55      5000
           1       0.52      0.45      0.48      5000

    accuracy                           0.52     10000
   macro avg       0.52      0.52      0.52     10000
weighted avg       0.52      0.52      0.52     10000




Enhanced Deep Learning Model Results:
Training Accuracy: 0.9209
Test Accuracy: 0.5177

Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.58      0.55      5000
           1       0.52      0.45      0.48      5000

    accuracy                           0.52     10000
   macro avg       0.52      0.52      0.52     10000
weighted avg       0.52      0.52      0.52     10000



SyntaxError: invalid decimal literal (2931570926.py, line 25)

# Deep Learning Sentiment Analysis Performance Report

## Overall Performance Metrics

| Category | Value |
|----------|-------|
| Training Accuracy | 92.09% |
| Test Accuracy | 51.77% |
| Model Type | Bidirectional LSTM |
| Preprocessing | Advanced Lemmatization |

## Detailed Performance Metrics

| Metric | Negative (Class 0) | Positive (Class 1) | Macro Average |
|--------|-------------------|-------------------|--------------|
| Precision | 0.52 | 0.52 | 0.52 |
| Recall | 0.58 | 0.45 | 0.52 |
| F1-Score | 0.55 | 0.48 | 0.52 |

## Model Architecture Details

| Layer | Configuration |
|-------|---------------|
| Embedding | 128 dimensions |
| 1st LSTM | Bidirectional, 128 units |
| 2nd LSTM | Bidirectional, 64 units |
| Dense Layers | 64 and 32 units, ReLU activation |
| Output Layer | Sigmoid activation |

## Training Configuration

| Parameter | Value |
|-----------|-------|
| Optimizer | Adam |
| Learning Rate | 0.0005 |
| Batch Size | 64 |
| Epochs | 20 |
| Early Stopping | Patience of 5 |
| Learning Rate Reduction | Patience of 3 |

## Preprocessing Techniques

| Technique | Description |
|-----------|-------------|
| Tokenization | Word-level |
| Text Cleaning | URL removal, mention removal |
| Lemmatization | WordNet Lemmatizer |
| Stopword Removal | Enhanced custom stopwords |

## Key Observations

| Aspect | Insight |
|--------|---------|
| Overfitting | High training accuracy, low test accuracy |
| Model Complexity | Advanced architecture with multiple techniques |
| Potential Improvements | Feature engineering, regularization |

## Recommendations

| Area | Suggested Action |
|------|-----------------|
| Overfitting | Increase regularization |
| Data Preprocessing | Experiment with feature extraction |
| Model Tuning | Adjust hyperparameters |