In [2]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import nltk

In [3]:
# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
def load_and_preprocess_data():
    """Load and preprocess the Twitter dataset"""
    column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
    data = pd.read_csv("twitter_dataset.csv", names=column_names, encoding='ISO-8859-1')
    data['target'] = data['target'].replace(4, 1)
    return data

In [42]:
#sample data 10,000 rows randomly
data = load_and_preprocess_data()
data = data.sample(n=30000, random_state=42)
data.head()
data.shape


(30000, 6)

 """Main function to run the improved sentiment analysis"""

In [None]:
# Load and preprocess data
# print("Loading and preprocessing data...")

# data = load_and_preprocess_data()
# data.shape

Loading and preprocessing data...


(1600000, 6)

In [43]:
data.head()

Unnamed: 0,target,ids,date,flag,user,text
541200,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs AHHH I HOPE YOUR OK!!!
750,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo..."
766711,0,2300048954,Tue Jun 23 13:40:11 PDT 2009,NO_QUERY,sammydearr,@TiannaChaos i know just family drama. its la...
285055,0,1993474027,Mon Jun 01 10:26:07 PDT 2009,NO_QUERY,Lamb_Leanne,School email won't open and I have geography ...
705995,0,2256550904,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,yogicerdito,upper airways problem


In [44]:
data.shape

(30000, 6)

In [46]:
# counting the num of missing values in the dataset
data.isnull().sum()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

In [47]:
# checking the distribution of target col

data['target'].value_counts()

target
1    15001
0    14999
Name: count, dtype: int64

In [48]:
def advanced_text_preprocessing(content):

    # Initialize lemmatizer

    lemmatizer = WordNetLemmatizer()
    # Convert to lowercase and remove special characters
    lemmatized_content = re.sub(r'http\S+|www\S+|https\S+', '', content, flags=re.MULTILINE)  # Remove URLs
    lemmatized_content = re.sub(r'@\w+|\#', '', lemmatized_content)  # Remove mentions and hashtag symbols
    lemmatized_content = re.sub(r'[^a-zA-Z\s]', '', lemmatized_content)  # Remove numbers and punctuation
    lemmatized_content = lemmatized_content.lower()

    # Tokenization
    words = word_tokenize(lemmatized_content)
    
    # Remove stopwords and lemmatize
    stop_words = set(stopwords.words('english'))
    # Add custom stopwords that might not be useful for sentiment
    custom_stopwords = {'rt', 'via'}
    stop_words.update(custom_stopwords)
    processed_words = [
        lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 2
    ]
    lemmatized_content = ' '.join(processed_words)
    return lemmatized_content

In [49]:


data['text'][:3]

541200               @chrishasboobs AHHH I HOPE YOUR OK!!! 
750       @misstoriblack cool , i have no tweet apps  fo...
766711    @TiannaChaos i know  just family drama. its la...
Name: text, dtype: object

In [50]:
 # Apply advanced text preprocessing
    
print("Applying text preprocessing...")
    
data['lemmatized_content'] = data['text'].apply(advanced_text_preprocessing)

Applying text preprocessing...


In [51]:
data['lemmatized_content'][:3]

541200                                            ahhh hope
750                                    cool tweet apps razr
766711    know family drama lamehey next time hang kim g...
Name: lemmatized_content, dtype: object

In [52]:
def create_advanced_tfidf():
    """Create an advanced TF-IDF vectorizer"""
    return TfidfVectorizer(
        min_df=5,  # Minimum document frequency
        max_df=0.8,  # Maximum document frequency
        ngram_range=(1, 2),  # Include both unigrams and bigrams
        sublinear_tf=True,  # Apply sublinear scaling to term frequencies
        strip_accents='unicode',
        token_pattern=r'\b\w+\b'  # Only match word characters
    )

In [53]:
def train_model(X_train, y_train, X_test, y_test):
    """Train and evaluate multiple models using GridSearchCV"""
    
    # Create pipelines for different models
    lr_pipeline = Pipeline([
        ('tfidf', create_advanced_tfidf()),
        ('classifier', LogisticRegression())
    ])
    
    rf_pipeline = Pipeline([
        ('tfidf', create_advanced_tfidf()),
        ('classifier', RandomForestClassifier())
    ])
    
    # Define parameter grids for GridSearchCV
    lr_param_grid = {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__class_weight': ['balanced', None],
        'classifier__max_iter': [1000]
    }
    
    rf_param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [10, 20, None],
        'classifier__class_weight': ['balanced', 'balanced_subsample']
    }
    
    # Perform GridSearchCV for both models
    lr_grid = GridSearchCV(lr_pipeline, lr_param_grid, cv=5, n_jobs=-1, verbose=1)
    rf_grid = GridSearchCV(rf_pipeline, rf_param_grid, cv=5, n_jobs=-1, verbose=1)
    
    # Train both models
    print("Training Logistic Regression...")
    lr_grid.fit(X_train, y_train)
    print("Training Random Forest...")
    rf_grid.fit(X_train, y_train)
    
    # Get best models
    best_lr = lr_grid.best_estimator_
    best_rf = rf_grid.best_estimator_
    
    # Evaluate models
    models = {
        'Logistic Regression': best_lr,
        'Random Forest': best_rf
    }
    
    results = {}
    for name, model in models.items():
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)
        
        results[name] = {
            'train_accuracy': accuracy_score(y_train, train_pred),
            'test_accuracy': accuracy_score(y_test, test_pred),
            'classification_report': classification_report(y_test, test_pred)
        }
    
    return results, best_lr if lr_grid.best_score_ > rf_grid.best_score_ else best_rf

# Split data

In [54]:
data.head()

Unnamed: 0,target,ids,date,flag,user,text,lemmatized_content
541200,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs AHHH I HOPE YOUR OK!!!,ahhh hope
750,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo...",cool tweet apps razr
766711,0,2300048954,Tue Jun 23 13:40:11 PDT 2009,NO_QUERY,sammydearr,@TiannaChaos i know just family drama. its la...,know family drama lamehey next time hang kim g...
285055,0,1993474027,Mon Jun 01 10:26:07 PDT 2009,NO_QUERY,Lamb_Leanne,School email won't open and I have geography ...,school email wont open geography stuff revise ...
705995,0,2256550904,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,yogicerdito,upper airways problem,upper airway problem


In [55]:
data['lemmatized_content'].values
y = data['target'].values

In [56]:
# Split data
X = data['lemmatized_content'].values
y = data['target'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [57]:
    
# Train and evaluate models
print("Training models...")
results, best_model = train_model(X_train, y_train, X_test, y_test)
    
# Print results
for model_name, metrics in results.items():
    print(f"\n{model_name} Results:")
    print(f"Training Accuracy: {metrics['train_accuracy']:.4f}")
    print(f"Test Accuracy: {metrics['test_accuracy']:.4f}")
    print("\nClassification Report:")
    print(metrics['classification_report'])    
    print("\n")

Training models...
Training Logistic Regression...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Training Random Forest...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Logistic Regression Results:
Training Accuracy: 0.8094
Test Accuracy: 0.7438

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.73      0.74      3000
           1       0.74      0.76      0.75      3000

    accuracy                           0.74      6000
   macro avg       0.74      0.74      0.74      6000
weighted avg       0.74      0.74      0.74      6000




Random Forest Results:
Training Accuracy: 0.9840
Test Accuracy: 0.7277

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.72      0.73      3000
           1       0.73      0.73      0.73      3000

    accuracy                           0.73      6000
   macro avg       0.73      0.73      0.73      6000
wei

In [58]:
import pickle

In [59]:
filename = 'twitter_model_lemit.pkl'
pickle.dump(best_model, open(filename, 'wb'))

In [61]:
# loading the saved model

loaded_model = pickle.load(open('twitter_model_lemit.pkl', 'rb'))

In [63]:
X_new = X_test[5]


In [65]:
print(y_test[5])


0


In [69]:
prediction = (loaded_model.predict([X_new]))

print(prediction)

if (prediction[0]==0):
    print('Negative tweet')
else:
    print('Positive tweet')



[0]
Negative tweet


# Sentiment Analysis Model Comparison Report

## Overview
This report presents a comparative analysis of Logistic Regression and Random Forest models for Twitter sentiment classification using advanced text preprocessing and TF-IDF vectorization.

## 1. Model Performance Comparison

| Metric | Logistic Regression | Random Forest |
|--------|---------------------|---------------|
| **Training Accuracy** | 0.8094 | 0.9840 |
| **Test Accuracy** | 0.7438 | 0.7277 |
| **Precision (Negative)** | 0.75 | 0.73 |
| **Precision (Positive)** | 0.74 | 0.73 |
| **Recall (Negative)** | 0.73 | 0.72 |
| **Recall (Positive)** | 0.76 | 0.73 |
| **F1-Score (Negative)** | 0.74 | 0.73 |
| **F1-Score (Positive)** | 0.75 | 0.73 |

## 2. Detailed Performance Analysis

### 2.1 Accuracy
- Both models demonstrate comparable performance on the test set, with Logistic Regression slightly outperforming Random Forest.
- Logistic Regression achieved a test accuracy of 74.38%
- Random Forest achieved a test accuracy of 72.77%

### 2.2 Bias and Variance
- Logistic Regression shows less overfitting:
  - Training Accuracy: 80.94%
  - Test Accuracy: 74.38%
  - Variance Gap: ~6.56%

- Random Forest shows significant overfitting:
  - Training Accuracy: 98.40%
  - Test Accuracy: 72.77%
  - Variance Gap: ~25.63%

## 3. Model Complexity Analysis

### 3.1 Logistic Regression
- **Pros**:
  - Simpler model
  - Faster training and prediction
  - Less prone to overfitting
  - Better generalization
- **Cons**:
  - Assumes linear relationship
  - Less complex feature interactions

### 3.2 Random Forest
- **Pros**:
  - Handles non-linear relationships
  - Robust to outliers
  - Can capture complex feature interactions
- **Cons**:
  - Prone to overfitting
  - Computationally more expensive
  - Less interpretable

## 4. Key Preprocessing Techniques

### Text Preprocessing
- Lemmatization
- URL and special character removal
- Stopword elimination
- Custom stopword addition

### Feature Extraction
- TF-IDF Vectorization
  - Unigrams and bigrams
  - Minimum document frequency: 5
  - Maximum document frequency: 0.8
  - Sublinear scaling of term frequencies

## 5. Recommendations

1. **Model Selection**: 
   - Logistic Regression is recommended due to:
     - Better test accuracy
     - Lower overfitting
     - Faster computation

2. **Future Improvements**:
   - Experiment with more advanced models (e.g., BERT, RoBERTa)
   - Collect more diverse training data
   - Implement cross-validation with more folds
   - Explore ensemble methods

## 6. Conclusion
The Logistic Regression model provides a more robust and generalizable solution for Twitter sentiment analysis in this context, balancing performance and computational efficiency.

## Appendix: Preprocessing Code Snippet

```python
def advanced_text_preprocessing(content):
    lemmatizer = WordNetLemmatizer()
    # Remove URLs and special characters
    content = re.sub(r'http\S+|www\S+|https\S+', '', content, flags=re.MULTILINE)
    content = re.sub(r'@\w+|\#', '', content)
    content = re.sub(r'[^a-zA-Z\s]', '', content)
    content = content.lower()

    # Tokenization and lemmatization
    words = word_tokenize(content)
    stop_words = set(stopwords.words('english'))
    custom_stopwords = {'rt', 'via'}
    stop_words.update(custom_stopwords)
    
    processed_words = [
        lemmatizer.lemmatize(word) 
        for word in words 
        if word not in stop_words and len(word) > 2
    ]
    
    return ' '.join(processed_words)
```

**Date of Analysis**: November 27, 2024
**Dataset**: Twitter Sentiment Dataset
**Sample Size**: 30,000 tweets