In [2]:
pip install gensim

In [12]:
import numpy as np
import pandas as pd
import re
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SpatialDropout1D, Bidirectional, LSTM, GRU, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, concatenate, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import class_weight

In [13]:
def advanced_text_preprocessing(content):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer
    
    lemmatizer = WordNetLemmatizer()
    # Comprehensive text cleaning
    content = re.sub(r'http\S+|www\S+|https\S+', '', content, flags=re.MULTILINE)
    content = re.sub(r'@\w+', '', content)
    content = re.sub(r'\#', '', content)
    content = re.sub(r'[^a-zA-Z\s]', '', content)
    content = content.lower()

    # Advanced tokenization
    words = word_tokenize(content)

    # Enhanced stopwords and lemmatization
    stop_words = set(stopwords.words('english'))
    custom_stopwords = {'rt', 'via', 'amp', 'u', 'ur'}
    stop_words.update(custom_stopwords)
    
    processed_words = [
        lemmatizer.lemmatize(word) for word in words 
        if word not in stop_words and len(word) > 1
    ]
    return ' '.join(processed_words)

In [14]:
def create_hybrid_model(vocab_size, max_length, embedding_dim=200):
    """Create a hybrid CNN-LSTM-GRU model"""
    # Input layer
    input_layer = tf.keras.layers.Input(shape=(max_length,))
    
    # Embedding layer
    embedding = Embedding(
        vocab_size, 
        embedding_dim, 
        input_length=max_length, 
        trainable=True
    )(input_layer)
    
    # Spatial dropout
    x = SpatialDropout1D(0.3)(embedding)
    
    # Parallel processing branches
    # CNN branch
    cnn = Conv1D(128, 3, activation='relu')(x)
    cnn = MaxPooling1D(3)(cnn)
    cnn = Flatten()(cnn)
    
    # LSTM branch
    lstm = Bidirectional(LSTM(128, return_sequences=True))(x)
    lstm = GlobalAveragePooling1D()(lstm)
    
    # GRU branch
    gru = Bidirectional(GRU(64))(x)
    
    # Concatenate features
    merged = concatenate([
        cnn, 
        lstm, 
        gru
    ])
    
    # Dense layers
    x = Dense(256, activation='relu')(merged)
    x = Dropout(0.5)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.4)(x)
    
    # Output layer
    output = Dense(1, activation='sigmoid')(x)
    
    # Create model
    model = Model(inputs=input_layer, outputs=output)
    
    # Compile with advanced optimizer
    model.compile(
        optimizer=Adam(learning_rate=0.0003),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [15]:
def train_advanced_model(X_train, y_train, X_test, y_test):
    # Prepare sequences
    max_words = 15000
    max_length = 200
    
    # Tokenization
    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train)
    
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)
    
    # Pad sequences
    X_train_pad = pad_sequences(
        X_train_seq, 
        maxlen=max_length, 
        padding='post', 
        truncating='post'
    )
    X_test_pad = pad_sequences(
        X_test_seq, 
        maxlen=max_length, 
        padding='post', 
        truncating='post'
    )
    
    # Compute class weights
    class_weights = class_weight.compute_class_weight(
        'balanced', 
        classes=np.unique(y_train), 
        y=y_train
    )
    class_weight_dict = dict(enumerate(class_weights))
    
    # Create model
    vocab_size = len(tokenizer.word_index) + 1
    model = create_hybrid_model(vocab_size, max_length)
    
    # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss', 
        patience=7, 
        restore_best_weights=True
    )
    
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss', 
        factor=0.2, 
        patience=4, 
        min_lr=0.000001
    )
    
    # Train model
    history = model.fit(
        X_train_pad, y_train, 
        epochs=30, 
        batch_size=128, 
        validation_split=0.2,
        class_weight=class_weight_dict,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )
    
    # Evaluate
    train_pred = (model.predict(X_train_pad) > 0.5).astype(int).flatten()
    test_pred = (model.predict(X_test_pad) > 0.5).astype(int).flatten()
    
    # Detailed results
    results = {
        'Enhanced Model': {
            'train_accuracy': accuracy_score(y_train, train_pred),
            'test_accuracy': accuracy_score(y_test, test_pred),
            'classification_report': classification_report(y_test, test_pred),
            'confusion_matrix': confusion_matrix(y_test, test_pred)
        }
    }
    
    return results, model, tokenizer

In [16]:
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

True

In [17]:
# Load data
print("Loading and preprocessing data...")
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
data = pd.read_csv("twitter_dataset.csv", names=column_names, encoding='ISO-8859-1')
data['target'] = data['target'].replace(4, 1)

Loading and preprocessing data...


In [18]:
# Preprocess text

data = data.sample(n=30000, random_state=42)

data['processed_text'] = data['text'].apply(advanced_text_preprocessing)

In [19]:
# Split data
X = data['processed_text'].values
y = data['target'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [20]:
# Train model
print("Training advanced hybrid model...")
results, model, tokenizer = train_advanced_model(X_train, y_train, X_test, y_test)

Training advanced hybrid model...




Epoch 1/30
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 996ms/step - accuracy: 0.5053 - loss: 0.6939 - val_accuracy: 0.6404 - val_loss: 0.6659 - learning_rate: 3.0000e-04
Epoch 2/30
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 1s/step - accuracy: 0.7034 - loss: 0.5815 - val_accuracy: 0.7400 - val_loss: 0.5200 - learning_rate: 3.0000e-04
Epoch 3/30
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 1s/step - accuracy: 0.8166 - loss: 0.4267 - val_accuracy: 0.7427 - val_loss: 0.5175 - learning_rate: 3.0000e-04
Epoch 4/30
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 1s/step - accuracy: 0.8531 - loss: 0.3536 - val_accuracy: 0.7423 - val_loss: 0.5603 - learning_rate: 3.0000e-04
Epoch 5/30
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 1s/step - accuracy: 0.8875 - loss: 0.2834 - val_accuracy: 0.7319 - val_loss: 0.6014 - learning_rate: 3.0000e-04
Epoch 6/30
[1m150/150[0m [32m━━━━━━━━━━

In [21]:
# Print results
for model_name, metrics in results.items():
    print(f"\n{model_name} Results:")
    print(f"Training Accuracy: {metrics['train_accuracy']:.4f}")
    print(f"Test Accuracy: {metrics['test_accuracy']:.4f}")
    print("\nClassification Report:")
    print(metrics['classification_report'])
    print("\nConfusion Matrix:")
    print(metrics['confusion_matrix'])


Enhanced Model Results:
Training Accuracy: 0.8415
Test Accuracy: 0.7282

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.74      0.73      3000
           1       0.74      0.71      0.72      3000

    accuracy                           0.73      6000
   macro avg       0.73      0.73      0.73      6000
weighted avg       0.73      0.73      0.73      6000


Confusion Matrix:
[[2231  769]
 [ 862 2138]]


In [22]:
import pickle

In [23]:
filename = 'twitter_model_deeplearning_v3.pkl'
pickle.dump(model_name, open(filename, 'wb'))

In [None]:
loaded_model = pickle.load(open('twitter_model_deeplearning_v3.pkl', 'rb'))



In [None]:
test = 'this is a test tweet'
prediction = loaded_model.predict(test)

AttributeError: 'str' object has no attribute 'predict'

# Twitter Sentiment Analysis Model Evaluation Report

## 1. Model Overview
**Model Type**: Hybrid CNN-LSTM-GRU Deep Learning Model
**Dataset**: Twitter Sentiment Dataset (30,000 samples)
**Task**: Binary Sentiment Classification

## 2. Model Accuracy

| Metric | Value |
|--------|-------|
| Training Accuracy | 0.8415 |
| Test Accuracy | 0.7282 |

## 3. Detailed Performance Metrics

### Classification Report

| Class | Precision | Recall | F1-Score | Support |
|-------|-----------|--------|----------|---------|
| Negative (0) | 0.72 | 0.74 | 0.73 | 3,000 |
| Positive (1) | 0.74 | 0.71 | 0.72 | 3,000 |
| **Macro Average** | **0.73** | **0.73** | **0.73** | **6,000** |

### Confusion Matrix

| Actual \ Predicted | Negative | Positive |
|-------------------|----------|----------|
| Negative | 2,231 | 769 |
| Positive | 862 | 2,138 |

## 4. Training Efficiency
- **Epochs Trained**: 10
- **Early Stopping**: Patience of 7 epochs
- **Learning Rate Reduction**: Applied with factor 0.2
- **Batch Size**: 128
- **Validation Split**: 0.2

## 5. Model Complexity Analysis

### Architecture Components
- **Embedding Layer**: Trainable, 200-dimensional
- **Spatial Dropout**: 0.3
- **Parallel Processing Branches**:
  1. CNN Branch: 128 filters, kernel size 3
  2. Bidirectional LSTM Branch: 128 units
  3. Bidirectional GRU Branch: 64 units
- **Dense Layers**: 
  - First layer: 256 neurons, ReLU activation
  - Second layer: 128 neurons, ReLU activation
- **Dropout Layers**: 0.5 and 0.4 regularization

## 6. Key Findings
- Model demonstrates balanced performance across positive and negative classes
- Slight overfitting observed (training accuracy higher than test accuracy)
- Effective use of hybrid architecture combining CNN, LSTM, and GRU
- Strong generalization with 72.82% test accuracy

## 7. Preprocessing Techniques
- Advanced text cleaning
- URL and special character removal
- Lowercase conversion
- Stopwords removal
- Lemmatization
- Tokenization

## 8. Recommendations
- Experiment with more data augmentation
- Try transfer learning with pre-trained embeddings
- Explore ensemble methods
- Fine-tune hyperparameters

## 9. Conclusion
The hybrid deep learning model shows promising results in Twitter sentiment analysis, with robust performance and a sophisticated multi-branch architecture.