In [9]:
pip install gensim

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [10]:
import numpy as np
import pandas as pd
import re
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SpatialDropout1D, Bidirectional, LSTM, GRU, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, concatenate, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import class_weight

In [11]:
def advanced_text_preprocessing(content):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    # Comprehensive text cleaning
    content = re.sub(r'http\S+|www\S+|https\S+', '', content, flags=re.MULTILINE)
    content = re.sub(r'@\w+', '', content)
    content = re.sub(r'\#', '', content)
    content = re.sub(r'[^a-zA-Z\s]', '', content)
    content = content.lower()

    # Advanced tokenization
    words = word_tokenize(content)

    # Enhanced stopwords and lemmatization
    stop_words = set(stopwords.words('english'))
    custom_stopwords = {'rt', 'via', 'amp', 'u', 'ur'}
    stop_words.update(custom_stopwords)

    processed_words = [
        lemmatizer.lemmatize(word) for word in words
        if word not in stop_words and len(word) > 1
    ]
    return ' '.join(processed_words)

In [12]:
def create_hybrid_model(vocab_size, max_length, embedding_dim=200):
    """Create a hybrid CNN-LSTM-GRU model"""
    # Input layer
    input_layer = tf.keras.layers.Input(shape=(max_length,))

    # Embedding layer
    embedding = Embedding(
        vocab_size,
        embedding_dim,
        input_length=max_length,
        trainable=True
    )(input_layer)

    # Spatial dropout
    x = SpatialDropout1D(0.3)(embedding)

    # Parallel processing branches
    # CNN branch
    cnn = Conv1D(128, 3, activation='relu')(x)
    cnn = MaxPooling1D(3)(cnn)
    cnn = Flatten()(cnn)

    # LSTM branch
    lstm = Bidirectional(LSTM(128, return_sequences=True))(x)
    lstm = GlobalAveragePooling1D()(lstm)

    # GRU branch
    gru = Bidirectional(GRU(64))(x)

    # Concatenate features
    merged = concatenate([
        cnn,
        lstm,
        gru
    ])

    # Dense layers
    x = Dense(256, activation='relu')(merged)
    x = Dropout(0.5)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.4)(x)

    # Output layer
    output = Dense(1, activation='sigmoid')(x)

    # Create model
    model = Model(inputs=input_layer, outputs=output)

    # Compile with advanced optimizer
    model.compile(
        optimizer=Adam(learning_rate=0.0003),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

In [13]:
def train_advanced_model(X_train, y_train, X_test, y_test):
    # Prepare sequences
    max_words = 15000
    max_length = 200

    # Tokenization
    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train)

    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    # Pad sequences
    X_train_pad = pad_sequences(
        X_train_seq,
        maxlen=max_length,
        padding='post',
        truncating='post'
    )
    X_test_pad = pad_sequences(
        X_test_seq,
        maxlen=max_length,
        padding='post',
        truncating='post'
    )

    # Compute class weights
    class_weights = class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(y_train),
        y=y_train
    )
    class_weight_dict = dict(enumerate(class_weights))

    # Create model
    vocab_size = len(tokenizer.word_index) + 1
    model = create_hybrid_model(vocab_size, max_length)

    # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=7,
        restore_best_weights=True
    )

    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=4,
        min_lr=0.000001
    )

    # Train model
    history = model.fit(
        X_train_pad, y_train,
        epochs=30,
        batch_size=128,
        validation_split=0.2,
        class_weight=class_weight_dict,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )

    # Evaluate
    train_pred = (model.predict(X_train_pad) > 0.5).astype(int).flatten()
    test_pred = (model.predict(X_test_pad) > 0.5).astype(int).flatten()

    # Detailed results
    results = {
        'Enhanced Model': {
            'train_accuracy': accuracy_score(y_train, train_pred),
            'test_accuracy': accuracy_score(y_test, test_pred),
            'classification_report': classification_report(y_test, test_pred),
            'confusion_matrix': confusion_matrix(y_test, test_pred)
        }
    }

    return results, model, tokenizer

In [14]:
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

True

In [15]:
# Load data
print("Loading and preprocessing data...")
# column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
data = pd.read_csv("Twitter_Data.csv", encoding='ISO-8859-1')
# Remove NaN values
data = data.dropna()
# data['target'] = data['target'].replace(4, 1)

Loading and preprocessing data...


In [16]:
data.head()

Unnamed: 0,clean_text,category
0,when modi promised âminimum government maxim...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [17]:
print(data.columns)
print(data.dtypes)

Index(['clean_text', 'category'], dtype='object')
clean_text     object
category      float64
dtype: object


In [18]:
# Preprocess text

# data = data.sample(n=30000, random_state=42)

data['processed_text'] = data['clean_text'].astype(str).apply(advanced_text_preprocessing)
# data['processed_text'] = data['clean_text'].apply(advanced_text_preprocessing)

In [19]:
# Split data
X = data['processed_text'].values
y = data['category'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [20]:
# Train model
print("Training advanced hybrid model...")
results, model, tokenizer = train_advanced_model(X_train, y_train, X_test, y_test)

Training advanced hybrid model...




Epoch 1/30
[1m815/815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m884s[0m 1s/step - accuracy: 0.4207 - loss: -3185.7737 - val_accuracy: 0.5581 - val_loss: -176873.3125 - learning_rate: 3.0000e-04
Epoch 2/30
[1m815/815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m876s[0m 1s/step - accuracy: 0.5490 - loss: -904624.6250 - val_accuracy: 0.5627 - val_loss: -6904888.0000 - learning_rate: 3.0000e-04
Epoch 3/30
[1m815/815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m898s[0m 1s/step - accuracy: 0.5442 - loss: -13721802.0000 - val_accuracy: 0.5347 - val_loss: -44280652.0000 - learning_rate: 3.0000e-04
Epoch 4/30
[1m815/815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m910s[0m 1s/step - accuracy: 0.5458 - loss: -67837736.0000 - val_accuracy: 0.5457 - val_loss: -153060224.0000 - learning_rate: 3.0000e-04
Epoch 5/30
[1m815/815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m914s[0m 1s/step - accuracy: 0.5393 - loss: -210227888.0000 - val_accuracy: 0.5582 - val_loss: -381109024.0000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
# Print results
for model_name, metrics in results.items():
    print(f"\n{model_name} Results:")
    print(f"Training Accuracy: {metrics['train_accuracy']:.4f}")
    print(f"Test Accuracy: {metrics['test_accuracy']:.4f}")
    print("\nClassification Report:")
    print(metrics['classification_report'])
    print("\nConfusion Matrix:")
    print(metrics['confusion_matrix'])


Enhanced Model Results:
Training Accuracy: 0.5563
Test Accuracy: 0.5484

Classification Report:
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00      7102
         0.0       0.43      1.00      0.60     11042
         1.0       0.98      0.48      0.64     14450

    accuracy                           0.55     32594
   macro avg       0.47      0.49      0.41     32594
weighted avg       0.58      0.55      0.49     32594


Confusion Matrix:
[[    0  7004    98]
 [    0 11010    32]
 [    0  7586  6864]]


In [22]:
import pickle

In [23]:
filename = 'twitter_model_deeplearning_v3.pkl'
pickle.dump(model_name, open(filename, 'wb'))

In [24]:
loaded_model = pickle.load(open('twitter_model_deeplearning_v3.pkl', 'rb'))



In [25]:
test = 'this is a test tweet'
prediction = loaded_model.predict(test)

AttributeError: 'str' object has no attribute 'predict'

# Machine Learning Model Performance Report

## Overview
**Model Type:** Hybrid CNN-LSTM-GRU Neural Network for Text Classification

## Performance Metrics

| Metric Category | Detailed Metrics | Value | Interpretation |
|----------------|-----------------|-------|----------------|
| **Model Accuracy** | Overall Accuracy | 54.84% | Moderate performance, slightly better than random guessing |
| | Training Accuracy | 55.63% | Consistent with test accuracy, minimal overfitting |

## Detailed Classification Performance

| Class | Precision | Recall | F1-Score | Support | 
|-------|-----------|--------|----------|---------|
| Negative (-1.0) | 0.00 | 0.00 | 0.00 | 7,102 |
| Neutral (0.0) | 0.43 | 1.00 | 0.60 | 11,042 |
| Positive (1.0) | 0.98 | 0.48 | 0.64 | 14,450 |

## Model Complexity Analysis

| Component | Description | Complexity |
|-----------|-------------|------------|
| Architecture | Hybrid CNN-LSTM-GRU | High |
| Embedding Dimension | 200 | Medium |
| Max Sequence Length | 200 | Medium |
| Vocabulary Size | 15,000 | Large |

## Key Findings

1. **Class Imbalance**
   - Significant performance variation across classes
   - Positive class shows high precision (0.98)
   - Negative class shows complete misclassification

2. **Model Limitations**
   - Struggles with identifying negative sentiments
   - High recall for neutral class (1.00)
   - Moderate performance for positive class

3. **Potential Improvements**
   - Address class imbalance
   - Enhance preprocessing for negative sentiment detection
   - Consider advanced sampling techniques

## Recommendations

1. Use class weights or advanced sampling techniques
2. Experiment with feature engineering
3. Explore more sophisticated preprocessing
4. Consider ensemble methods

## Confusion Matrix Breakdown

| Predicted \ Actual | Negative | Neutral | Positive |
|--------------------|----------|---------|----------|
| Negative | 0 | 7,004 | 98 |
| Neutral | 0 | 11,010 | 32 |
| Positive | 0 | 7,586 | 6,864 |

**Note:** The model shows a strong bias towards neutral classification, particularly misclassifying negative samples.