In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import nltk

In [2]:
# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
def load_and_preprocess_data():
    """Load and preprocess the Twitter dataset"""
    # column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
    data = pd.read_csv("Twitter_Data.csv", encoding='ISO-8859-1')
    data['target'] = data['category'].replace(-1, 2)
    return data

In [10]:
#sample data 10,000 rows randomly
data = load_and_preprocess_data()
# data = data.sample(n=30000, random_state=42)

data.shape


(162980, 3)

In [11]:
data.head()

Unnamed: 0,clean_text,category,target
0,when modi promised âminimum government maxim...,-1.0,2.0
1,talk all the nonsense and continue all the dra...,0.0,0.0
2,what did just say vote for modi welcome bjp t...,1.0,1.0
3,asking his supporters prefix chowkidar their n...,1.0,1.0
4,answer who among these the most powerful world...,1.0,1.0


 """Main function to run the improved sentiment analysis"""

In [12]:
data.head()

Unnamed: 0,clean_text,category,target
0,when modi promised âminimum government maxim...,-1.0,2.0
1,talk all the nonsense and continue all the dra...,0.0,0.0
2,what did just say vote for modi welcome bjp t...,1.0,1.0
3,asking his supporters prefix chowkidar their n...,1.0,1.0
4,answer who among these the most powerful world...,1.0,1.0


In [13]:
data.shape

(162980, 3)

In [16]:
data = data.dropna()

In [17]:
# counting the num of missing values in the dataset
data.isnull().sum()

clean_text    0
category      0
target        0
dtype: int64

In [18]:
# checking the distribution of target col

data['target'].value_counts()

target
1.0    72249
0.0    55211
2.0    35509
Name: count, dtype: int64

In [19]:
def advanced_text_preprocessing(content):

    # Initialize lemmatizer

    lemmatizer = WordNetLemmatizer()
    # Convert to lowercase and remove special characters
    lemmatized_content = re.sub(r'http\S+|www\S+|https\S+', '', content, flags=re.MULTILINE)  # Remove URLs
    lemmatized_content = re.sub(r'@\w+|\#', '', lemmatized_content)  # Remove mentions and hashtag symbols
    lemmatized_content = re.sub(r'[^a-zA-Z\s]', '', lemmatized_content)  # Remove numbers and punctuation
    lemmatized_content = lemmatized_content.lower()

    # Tokenization
    words = word_tokenize(lemmatized_content)
    
    # Remove stopwords and lemmatize
    stop_words = set(stopwords.words('english'))
    # Add custom stopwords that might not be useful for sentiment
    custom_stopwords = {'rt', 'via'}
    stop_words.update(custom_stopwords)
    processed_words = [
        lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 2
    ]
    lemmatized_content = ' '.join(processed_words)
    return lemmatized_content

In [21]:


data['clean_text'][:3]

0    when modi promised âminimum government maxim...
1    talk all the nonsense and continue all the dra...
2    what did just say vote for modi  welcome bjp t...
Name: clean_text, dtype: object

In [23]:
 # Apply advanced text preprocessing
    
print("Applying text preprocessing...")
    
data['lemmatized_content'] = data['clean_text'].apply(advanced_text_preprocessing)

Applying text preprocessing...


In [24]:
data['lemmatized_content'][:3]

0    modi promised minimum government maximum gover...
1               talk nonsense continue drama vote modi
2    say vote modi welcome bjp told rahul main camp...
Name: lemmatized_content, dtype: object

In [25]:
def create_advanced_tfidf():
    """Create an advanced TF-IDF vectorizer"""
    return TfidfVectorizer(
        min_df=5,  # Minimum document frequency
        max_df=0.8,  # Maximum document frequency
        ngram_range=(1, 2),  # Include both unigrams and bigrams
        sublinear_tf=True,  # Apply sublinear scaling to term frequencies
        strip_accents='unicode',
        token_pattern=r'\b\w+\b'  # Only match word characters
    )

In [27]:
def train_model(X_train, y_train, X_test, y_test):
    """Train and evaluate multiple models using GridSearchCV"""
    
    # Create pipelines for different models
    lr_pipeline = Pipeline([
        ('tfidf', create_advanced_tfidf()),
        ('classifier', LogisticRegression())
    ])
    
    rf_pipeline = Pipeline([
        ('tfidf', create_advanced_tfidf()),
        ('classifier', RandomForestClassifier())
    ])
    
    # Define parameter grids for GridSearchCV
    lr_param_grid = {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__class_weight': ['balanced', None],
        'classifier__max_iter': [1000]
    }
    
    rf_param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [10, 20, None],
        'classifier__class_weight': ['balanced', 'balanced_subsample']
    }
    
    # Perform GridSearchCV for both models
    lr_grid = GridSearchCV(lr_pipeline, lr_param_grid, cv=5, n_jobs=-1, verbose=1)
    rf_grid = GridSearchCV(rf_pipeline, rf_param_grid, cv=5, n_jobs=-1, verbose=1)
    
    # Train both models
    print("Training Logistic Regression...")
    lr_grid.fit(X_train, y_train)
    print("Training Random Forest...")
    rf_grid.fit(X_train, y_train)
    
    # Get best models
    best_lr = lr_grid.best_estimator_
    best_rf = rf_grid.best_estimator_
    
    # Evaluate models
    models = {
        'Logistic Regression': best_lr,
        'Random Forest': best_rf
    }
    
    results = {}
    for name, model in models.items():
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)
        
        results[name] = {
            'train_accuracy': accuracy_score(y_train, train_pred),
            'test_accuracy': accuracy_score(y_test, test_pred),
            'classification_report': classification_report(y_test, test_pred)
        }
    
    return results, best_lr if lr_grid.best_score_ > rf_grid.best_score_ else best_rf

# Split data

In [28]:
data.head()

Unnamed: 0,clean_text,category,target,lemmatized_content
0,when modi promised âminimum government maxim...,-1.0,2.0,modi promised minimum government maximum gover...
1,talk all the nonsense and continue all the dra...,0.0,0.0,talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,1.0,1.0,say vote modi welcome bjp told rahul main camp...
3,asking his supporters prefix chowkidar their n...,1.0,1.0,asking supporter prefix chowkidar name modi gr...
4,answer who among these the most powerful world...,1.0,1.0,answer among powerful world leader today trump...


In [29]:
data['lemmatized_content'].values
y = data['target'].values

In [30]:
# Split data
X = data['lemmatized_content'].values
y = data['target'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [31]:
    
# Train and evaluate models
print("Training models...")
results, best_model = train_model(X_train, y_train, X_test, y_test)
    
# Print results
for model_name, metrics in results.items():
    print(f"\n{model_name} Results:")
    print(f"Training Accuracy: {metrics['train_accuracy']:.4f}")
    print(f"Test Accuracy: {metrics['test_accuracy']:.4f}")
    print("\nClassification Report:")
    print(metrics['classification_report'])    
    print("\n")

Training models...
Training Logistic Regression...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Training Random Forest...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Logistic Regression Results:
Training Accuracy: 0.9728
Test Accuracy: 0.8791

Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.93      0.90     11042
         1.0       0.90      0.89      0.89     14450
         2.0       0.84      0.78      0.81      7102

    accuracy                           0.88     32594
   macro avg       0.87      0.87      0.87     32594
weighted avg       0.88      0.88      0.88     32594




Random Forest Results:
Training Accuracy: 0.9994
Test Accuracy: 0.8533

Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.94      0.89     11042
         1.0       0.86      0.88      0.87     14450
         2.0       0.85      0.67      0.75      7102

   

In [58]:
import pickle

In [59]:
filename = 'twitter_model_lemit.pkl'
pickle.dump(best_model, open(filename, 'wb'))

In [61]:
# loading the saved model

loaded_model = pickle.load(open('twitter_model_lemit.pkl', 'rb'))

In [63]:
X_new = X_test[5]


In [65]:
print(y_test[5])


0


In [69]:
prediction = (loaded_model.predict([X_new]))

print(prediction)

if (prediction[0]==0):
    print('Negative tweet')
else:
    print('Positive tweet')



[0]
Negative tweet


# Twitter Sentiment Analysis Model Evaluation Report

## Overview
This report presents a comprehensive evaluation of sentiment analysis models trained on a Twitter dataset, focusing on the performance of Logistic Regression and Random Forest classifiers.

## Dataset Characteristics
- **Total Samples**: 162,969
- **Class Distribution**:
  - Positive (1.0): 72,249 (44.3%)
  - Negative (0.0): 55,211 (33.9%)
  - Neutral (2.0): 35,509 (21.8%)

## Model Performance Comparison

| Metric | Logistic Regression | Random Forest |
|--------|---------------------|---------------|
| Training Accuracy | 0.9728 | 0.9994 |
| Test Accuracy | 0.8791 | 0.8533 |

### Detailed Performance Metrics

#### Logistic Regression
| Class | Precision | Recall | F1-Score |
|-------|-----------|--------|----------|
| Negative (0.0) | 0.88 | 0.93 | 0.90 |
| Positive (1.0) | 0.90 | 0.89 | 0.89 |
| Neutral (2.0) | 0.84 | 0.78 | 0.81 |

#### Random Forest
| Class | Precision | Recall | F1-Score |
|-------|-----------|--------|----------|
| Negative (0.0) | 0.85 | 0.94 | 0.89 |
| Positive (1.0) | 0.86 | 0.88 | 0.87 |
| Neutral (2.0) | 0.85 | 0.67 | 0.75 |

## Key Findings

1. **Model Accuracy**
   - Logistic Regression demonstrated slightly superior overall performance with a test accuracy of 87.91%
   - Random Forest achieved a test accuracy of 85.33%

2. **Model Complexity**
   - Logistic Regression showed more balanced performance across sentiment classes
   - Random Forest exhibited higher variance, particularly in neutral sentiment prediction

3. **Training Efficiency**
   - Logistic Regression required fewer grid search iterations (30 fits)
   - Random Forest required more computational resources (60 fits)
   - Both models showed signs of potential overfitting, with training accuracies significantly higher than test accuracies

## Preprocessing Techniques
- Advanced text preprocessing including:
  - URL removal
  - Mention and hashtag symbol removal
  - Lowercasing
  - Lemmatization
  - Stopword removal
- TF-IDF vectorization with:
  - Unigram and bigram consideration
  - Minimum document frequency of 5
  - Maximum document frequency of 0.8

## Recommendations
1. Consider ensemble methods to improve neutral sentiment classification
2. Explore additional feature engineering techniques
3. Investigate potential class imbalance mitigation strategies

## Conclusion
The Logistic Regression model provides a robust solution for Twitter sentiment analysis, offering balanced performance across sentiment categories with high interpretability and computational efficiency.

## Model Deployment
- Best performing model (Logistic Regression) saved as 'twitter_model_lemit.pkl'
- Ready for inference on new Twitter text data

## Future Work
- Experiment with deep learning approaches
- Collect more diverse training data
- Implement more advanced text representation techniques