In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Ensure same preprocessing as previous script
def advanced_text_preprocessing(content):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer
    
    lemmatizer = WordNetLemmatizer()
    # Convert to lowercase and remove special characters
    lemmatized_content = re.sub(r'http\S+|www\S+|https\S+', '', content, flags=re.MULTILINE)  # Remove URLs
    lemmatized_content = re.sub(r'@\w+|\#', '', lemmatized_content)  # Remove mentions and hashtag symbols
    lemmatized_content = re.sub(r'[^a-zA-Z\s]', '', lemmatized_content)  # Remove numbers and punctuation
    lemmatized_content = lemmatized_content.lower()

    # Tokenization
    words = word_tokenize(lemmatized_content)

    # Remove stopwords and lemmatize
    stop_words = set(stopwords.words('english'))
    custom_stopwords = {'rt', 'via'}
    stop_words.update(custom_stopwords)
    processed_words = [
        lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 2
    ]
    lemmatized_content = ' '.join(processed_words)
    return lemmatized_content

In [3]:
def load_data():
    """Load and preprocess the Twitter dataset"""
    # column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
    data = pd.read_csv("Twitter_Data.csv", encoding='ISO-8859-1')
    data['category'] = data['category'].replace(-1, 2)
    return data

In [4]:
def create_deep_learning_model(vocab_size, max_length):
    """Create a deep learning model with LSTM layers"""
    model = Sequential([
        # Embedding layer to convert words to dense vector representations
        Embedding(vocab_size, 100, input_length=max_length),
        
        # LSTM layer with dropout for regularization
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        
        # Additional dense layers with dropout
        Dense(64, activation='relu'),
        Dropout(0.5),
        
        # Output layer
        Dense(1, activation='sigmoid')
    ])
    
    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [5]:
def prepare_deep_learning_data(texts, max_words=5000, max_length=100):
    """Prepare text data for deep learning model"""
    # Tokenize the text
    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(texts)
    
    # Convert text to sequences
    sequences = tokenizer.texts_to_sequences(texts)
    
    # Pad sequences to ensure uniform length
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    
    return padded_sequences, tokenizer

In [6]:
def train_deep_learning_model(X_train, y_train, X_test, y_test):
    """Train and evaluate deep learning model"""
    # Prepare data
    max_words = 5000
    max_length = 100
    
    # Prepare sequences
    X_train_seq, tokenizer = prepare_deep_learning_data(X_train, max_words, max_length)
    X_test_seq, _ = prepare_deep_learning_data(X_test, max_words, max_length)
    
    # Ensure binary targets
    y_train = y_train.astype(float)
    y_test = y_test.astype(float)
    
    # Create and compile model
    vocab_size = len(tokenizer.word_index) + 1
    model = create_deep_learning_model(vocab_size, max_length)
    
    # Train the model
    history = model.fit(
        X_train_seq, y_train, 
        epochs=10, 
        batch_size=32, 
        validation_split=0.2,
        verbose=1
    )
    
    # Evaluate the model
    train_pred = (model.predict(X_train_seq) > 0.5).astype(int).flatten()
    test_pred = (model.predict(X_test_seq) > 0.5).astype(int).flatten()
    
    results = {
        'Deep Learning Model': {
            'train_accuracy': accuracy_score(y_train, train_pred),
            'test_accuracy': accuracy_score(y_test, test_pred),
            'classification_report': classification_report(y_test, test_pred)
        }
    }
    
    return results, model, tokenizer

In [7]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
# Load and preprocess data
    
print("Loading and preprocessing data...")
data = load_data()

Loading and preprocessing data...


In [None]:
data.head()

Unnamed: 0,clean_text,category
0,when modi promised âminimum government maxim...,2.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [10]:
# Preprocess text
# apply on 30k data
# data = data.sample(n=100000, random_state=42)

data['lemmatized_content'] = data['clean_text'].astype(str).apply(advanced_text_preprocessing)

In [11]:
data.head() 

Unnamed: 0,clean_text,category,lemmatized_content
0,when modi promised âminimum government maxim...,2.0,modi promised minimum government maximum gover...
1,talk all the nonsense and continue all the dra...,0.0,talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,1.0,say vote modi welcome bjp told rahul main camp...
3,asking his supporters prefix chowkidar their n...,1.0,asking supporter prefix chowkidar name modi gr...
4,answer who among these the most powerful world...,1.0,answer among powerful world leader today trump...


In [12]:
data = data.dropna()

In [13]:
# Split data
X = data['lemmatized_content'].values
y = data['category'].values
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, stratify=y, random_state=42
)

In [14]:
data.shape

(162969, 3)

In [15]:
data.head()

Unnamed: 0,clean_text,category,lemmatized_content
0,when modi promised âminimum government maxim...,2.0,modi promised minimum government maximum gover...
1,talk all the nonsense and continue all the dra...,0.0,talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,1.0,say vote modi welcome bjp told rahul main camp...
3,asking his supporters prefix chowkidar their n...,1.0,asking supporter prefix chowkidar name modi gr...
4,answer who among these the most powerful world...,1.0,answer among powerful world leader today trump...


In [16]:
# Train deep learning model
print("Training deep learning model...")

dl_results, dl_model, tokenizer = train_deep_learning_model(X_train, y_train, X_test, y_test)

Training deep learning model...
Epoch 1/10




[1m3260/3260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m212s[0m 64ms/step - accuracy: 0.4442 - loss: 0.3959 - val_accuracy: 0.4401 - val_loss: 0.3676
Epoch 2/10
[1m3260/3260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 66ms/step - accuracy: 0.4453 - loss: 0.3746 - val_accuracy: 0.4401 - val_loss: 0.3708
Epoch 3/10
[1m3260/3260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 67ms/step - accuracy: 0.4432 - loss: 0.3709 - val_accuracy: 0.4401 - val_loss: 0.3677
Epoch 4/10
[1m3260/3260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 68ms/step - accuracy: 0.4434 - loss: 0.3767 - val_accuracy: 0.4401 - val_loss: 0.3679
Epoch 5/10
[1m3260/3260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 66ms/step - accuracy: 0.4420 - loss: 0.3732 - val_accuracy: 0.4401 - val_loss: 0.3676
Epoch 6/10
[1m3260/3260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 67ms/step - accuracy: 0.4437 - loss: 0.3701 - val_accuracy: 0.4401 - val_loss: 0.3677
Epoch 7/1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
for model_name, metrics in dl_results.items():
        print(f"\n{model_name} Results:")
        print(f"Training Accuracy: {metrics['train_accuracy']:.4f}")
        print(f"Test Accuracy: {metrics['test_accuracy']:.4f}")
        print("\nClassification Report:")
        print(metrics['classification_report'])


Deep Learning Model Results:
Training Accuracy: 0.4433
Test Accuracy: 0.4433

Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00     11042
         1.0       0.44      1.00      0.61     14450
         2.0       0.00      0.00      0.00      7102

    accuracy                           0.44     32594
   macro avg       0.15      0.33      0.20     32594
weighted avg       0.20      0.44      0.27     32594



# Deep Learning Model Performance Report

## Model Overview
- **Model Type**: LSTM Neural Network
- **Dataset**: Twitter Sentiment Analysis
- **Training Epochs**: 10
- **Batch Size**: 32

## Performance Metrics

### Accuracy
| Metric | Value |
|--------|-------|
| Training Accuracy | 0.4433 |
| Test Accuracy | 0.4433 |

### Classification Metrics by Class

| Class | Precision | Recall | F1-Score | Support |
|-------|-----------|--------|----------|---------|
| 0.0 | 0.00 | 0.00 | 0.00 | 11,042 |
| 1.0 | 0.44 | 1.00 | 0.61 | 14,450 |
| 2.0 | 0.00 | 0.00 | 0.00 | 7,102 |

### Average Metrics

| Average Type | Precision | Recall | F1-Score |
|-------------|-----------|--------|----------|
| Macro Avg | 0.15 | 0.33 | 0.20 |
| Weighted Avg | 0.20 | 0.44 | 0.27 |

## Key Observations
- Significant class imbalance
- Weak performance for classes 0.0 and 2.0
- Strong bias towards predicting class 1.0
- Overall low accuracy (44%)

## Recommendations
1. Address class imbalance:
   - Oversampling minority classes
   - Using class weights
   - Applying SMOTE
2. Investigate feature engineering
3. Explore alternative model architectures
4. Collect more balanced training data

## Potential Improvements
- Experiment with embedding dimensions
- Try alternative neural network architectures
- Implement advanced text preprocessing
- Use transfer learning with pre-trained embeddings