In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import class_weight

In [2]:
def advanced_text_preprocessing(content):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer
    
    lemmatizer = WordNetLemmatizer()
    # More comprehensive preprocessing
    lemmatized_content = re.sub(r'http\S+|www\S+|https\S+', '', content, flags=re.MULTILINE)
    lemmatized_content = re.sub(r'@\w+', '', lemmatized_content)  # Remove mentions
    lemmatized_content = re.sub(r'\#', '', lemmatized_content)  # Remove hashtag symbols
    lemmatized_content = re.sub(r'[^\w\s]', '', lemmatized_content)  # Remove punctuation
    lemmatized_content = lemmatized_content.lower()

    # More advanced tokenization
    words = word_tokenize(lemmatized_content)

    # Enhanced stopwords removal and lemmatization
    stop_words = set(stopwords.words('english'))
    custom_stopwords = {'rt', 'via', 'amp', 'ur', 'u'}
    stop_words.update(custom_stopwords)
    
    processed_words = [
        lemmatizer.lemmatize(word) for word in words 
        if word not in stop_words and len(word) > 1
    ]
    lemmatized_content = ' '.join(processed_words)
    return lemmatized_content

In [9]:
def load_and_preprocess_data():
    # column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
    data = pd.read_csv("Twitter_Data.csv", encoding='ISO-8859-1')
    # data['target'] = data['target'].replace(4, 1)
    return data

In [4]:
def create_advanced_deep_learning_model(vocab_size, max_length):
    """Create an advanced deep learning model"""
    model = Sequential([
        # Enhanced embedding layer
        Embedding(vocab_size, 128, input_length=max_length, trainable=True),
        
        # Spatial dropout to prevent overfitting
        SpatialDropout1D(0.3),
        
        # Bidirectional LSTM for capturing context in both directions
        Bidirectional(LSTM(
            128, 
            dropout=0.2, 
            recurrent_dropout=0.2, 
            return_sequences=True
        )),
        
        # Another LSTM layer
        Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
        
        # More dense layers with dropout
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(32, activation='relu'),
        Dropout(0.3),
        
        # Output layer
        Dense(1, activation='sigmoid')
    ])
    
    # Advanced compilation with custom learning rate
    model.compile(
        optimizer=Adam(learning_rate=0.0005),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [5]:
def prepare_advanced_data(texts, max_words=10000, max_length=150):
    """Advanced data preparation with more tokens and longer sequences"""
    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(texts)
    
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(
        sequences, 
        maxlen=max_length, 
        padding='post', 
        truncating='post'
    )
    
    return padded_sequences, tokenizer

In [6]:
def train_advanced_model(X_train, y_train, X_test, y_test):
    """Advanced training with multiple optimization techniques"""
    # Prepare sequences
    max_words = 10000
    max_length = 150
    
    X_train_seq, tokenizer = prepare_advanced_data(X_train, max_words, max_length)
    X_test_seq, _ = prepare_advanced_data(X_test, max_words, max_length)
    
    # Calculate class weights to handle imbalanced data
    class_weights = class_weight.compute_class_weight(
        'balanced', 
        classes=np.unique(y_train), 
        y=y_train
    )
    class_weight_dict = dict(enumerate(class_weights))
    
    # Create model
    vocab_size = len(tokenizer.word_index) + 1
    model = create_advanced_deep_learning_model(vocab_size, max_length)
    
    # Advanced training callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss', 
        patience=5, 
        restore_best_weights=True
    )
    
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss', 
        factor=0.2, 
        patience=3, 
        min_lr=0.00001
    )
    
    # Train with advanced techniques
    history = model.fit(
        X_train_seq, y_train, 
        epochs=20, 
        batch_size=64, 
        validation_split=0.2,
        class_weight=class_weight_dict,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )
    
    # Evaluation
    train_pred = (model.predict(X_train_seq) > 0.5).astype(int).flatten()
    test_pred = (model.predict(X_test_seq) > 0.5).astype(int).flatten()
    
    results = {
        'Enhanced Deep Learning Model': {
            'train_accuracy': accuracy_score(y_train, train_pred),
            'test_accuracy': accuracy_score(y_test, test_pred),
            'classification_report': classification_report(y_test, test_pred)
        }
    }
    
    return results, model, tokenizer

In [7]:
# Download NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:
 # Load and preprocess data
print("Loading and preprocessing data...")
data = load_and_preprocess_data()

Loading and preprocessing data...


In [None]:
# Preprocess text

# data = data.sample(n=30000, random_state=42)

data['lemmatized_content'] = data['clean_text'].apply(advanced_text_preprocessing)

In [12]:
# Split data
X = data['lemmatized_content'].values
y = data['category'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [13]:
 # Train advanced model
print("Training advanced deep learning model...")

dl_results, dl_model, tokenizer = train_advanced_model(X_train, y_train, X_test, y_test)

Training advanced deep learning model...




Epoch 1/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 367ms/step - accuracy: 0.3480 - loss: 0.4785 - val_accuracy: 0.4804 - val_loss: -6.6127 - learning_rate: 5.0000e-04
Epoch 2/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 378ms/step - accuracy: 0.4519 - loss: -54.8220 - val_accuracy: 0.3335 - val_loss: -259.4646 - learning_rate: 5.0000e-04
Epoch 3/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 384ms/step - accuracy: 0.4387 - loss: -787.8546 - val_accuracy: 0.3335 - val_loss: -1247.5231 - learning_rate: 5.0000e-04
Epoch 4/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 382ms/step - accuracy: 0.3759 - loss: -3545.2664 - val_accuracy: 0.3335 - val_loss: -3541.9182 - learning_rate: 5.0000e-04
Epoch 5/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 385ms/step - accuracy: 0.3661 - loss: -8234.7119 - val_accuracy: 0.3335 - val_loss: -8195.3828 - learning_rate: 5.0000e-04
E

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [38]:
  # Print results
    
for model_name, metrics in dl_results.items():
    print(f"\n{model_name} Results:")
    print(f"Training Accuracy: {metrics['train_accuracy']:.4f}")
    print(f"Test Accuracy: {metrics['test_accuracy']:.4f}")
    print("\nClassification Report:")
    print(metrics['classification_report'])


Enhanced Deep Learning Model Results:
Training Accuracy: 0.8078
Test Accuracy: 0.5225

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.47      0.49      3000
           1       0.52      0.58      0.55      3000

    accuracy                           0.52      6000
   macro avg       0.52      0.52      0.52      6000
weighted avg       0.52      0.52      0.52      6000



# Enhanced Deep Learning Model Performance Report

## Overview
This report provides a comprehensive analysis of the Enhanced Deep Learning Model developed for text classification.

## Model Accuracy Metrics

| Metric | Value |
|--------|-------|
| Training Accuracy | 0.8078 |
| Test Accuracy | 0.5225 |
| Overall Accuracy | 0.52 |

## Detailed Performance Metrics

### Precision and Recall by Class

| Class | Precision | Recall | F1-Score | Support |
|-------|-----------|--------|----------|---------|
| Negative (0) | 0.53 | 0.47 | 0.49 | 3000 |
| Positive (1) | 0.52 | 0.58 | 0.55 | 3000 |

### Macro and Weighted Averages

| Average Type | Precision | Recall | F1-Score |
|-------------|-----------|--------|----------|
| Macro Avg | 0.52 | 0.52 | 0.52 |
| Weighted Avg | 0.52 | 0.52 | 0.52 |

## Training Efficiency Analysis

| Training Metric | Details |
|----------------|----------|
| Epochs | 20 |
| Batch Size | 64 |
| Learning Rate | 0.0005 |
| Optimization Techniques | 
| - Early Stopping | Monitored validation loss |
| - Reduce Learning Rate | Adaptive learning rate reduction |
| - Class Weighting | Balanced class weights |

## Model Complexity Analysis

| Component | Configuration |
|-----------|---------------|
| Embedding Layer | 128 dimensions |
| First LSTM Layer | Bidirectional, 128 units |
| Second LSTM Layer | Bidirectional, 64 units |
| Dense Layers | 64 and 32 units with ReLU activation |
| Dropout Rates | 0.3 (Embedding), 0.2 (LSTM), 0.5 and 0.3 (Dense) |

## Key Findings and Observations

1. **Performance Discrepancy**: 
   - High training accuracy (0.8078) 
   - Significantly lower test accuracy (0.5225)
   - Indicates potential overfitting

2. **Class Balance**:
   - Balanced dataset (3000 samples per class)
   - Similar performance across positive and negative classes

3. **Model Limitations**:
   - Moderate generalization capability
   - Suggests need for further model refinement

## Recommendations

1. Reduce model complexity
2. Implement more robust regularization
3. Experiment with:
   - Different embedding techniques
   - Advanced preprocessing
   - Alternative model architectures

## Conclusion
The enhanced deep learning model shows promise but requires further optimization to improve generalization and test performance.