## NLP and Model Comparison


### Dataset Source:- https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

In [3]:
import pandas as pd
import io
import requests
from zipfile import ZipFile

# URL to the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip"

# Download the zip file
response = requests.get(url)
zip_file = ZipFile(io.BytesIO(response.content))

# Let's load and combine all three datasets
amazon_data = pd.read_csv(
    zip_file.open('sentiment labelled sentences/amazon_cells_labelled.txt'),
    delimiter='\t',
    header=None,
    names=["review", "sentiment"]
)

imdb_data = pd.read_csv(
    zip_file.open('sentiment labelled sentences/imdb_labelled.txt'),
    delimiter='\t',
    header=None,
    names=["review", "sentiment"]
)

yelp_data = pd.read_csv(
    zip_file.open('sentiment labelled sentences/yelp_labelled.txt'),
    delimiter='\t',
    header=None,
    names=["review", "sentiment"]
)

# Add a source column to track where each review came from
amazon_data['source'] = 'amazon'
imdb_data['source'] = 'imdb'
yelp_data['source'] = 'yelp'

# Combine all datasets
df = pd.concat([amazon_data, imdb_data, yelp_data], ignore_index=True)

# Display the first 5 rows
print(df.head())

# Show dataset info
print("\nDataset Info:")
print(f"Total reviews: {len(df)}")
print(f"Reviews by source: {df['source'].value_counts().to_dict()}")
print(f"Sentiment distribution: {df['sentiment'].value_counts().to_dict()}")

                                              review  sentiment  source
0  So there is no way for me to plug it in here i...          0  amazon
1                        Good case, Excellent value.          1  amazon
2                             Great for the jawbone.          1  amazon
3  Tied to charger for conversations lasting more...          0  amazon
4                                  The mic is great.          1  amazon

Dataset Info:
Total reviews: 2748
Reviews by source: {'amazon': 1000, 'yelp': 1000, 'imdb': 748}
Sentiment distribution: {1: 1386, 0: 1362}


### Exercise 1: Basic Text Preprocessing with CountVectorizer

#### Goal: Create a count vectorizer without using sklearn to understand how word frequencies work.

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Step 1: Initialize CountVectorizer
vectorizer = CountVectorizer()

# Step 2: Fit and transform the reviews into a term-frequency matrix
tf_matrix = vectorizer.fit_transform(df["review"])

# Step 3: Convert the matrix to an array and print
print("Term-Frequency Matrix (first 5 rows):\n", tf_matrix[:5].toarray())

# Step 4: Print the feature names (vocabulary)
print("Vocabulary (first 10 words):", vectorizer.get_feature_names_out()[:10])

# Step 5: Display the size of the matrix
print("Matrix shape:", tf_matrix.shape)
print(f"Number of documents: {tf_matrix.shape[0]}")
print(f"Number of features (unique words): {tf_matrix.shape[1]}")
print(f"Total elements: {tf_matrix.shape[0] * tf_matrix.shape[1]}")
print(f"Non-zero elements: {tf_matrix.nnz}")
print(f"Sparsity: {100.0 * (1 - tf_matrix.nnz / (tf_matrix.shape[0] * tf_matrix.shape[1])):.2f}%")

Term-Frequency Matrix (first 5 rows):
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Vocabulary (first 10 words): ['00' '10' '100' '11' '12' '13' '15' '15g' '15pm' '17']
Matrix shape: (2748, 5155)
Number of documents: 2748
Number of features (unique words): 5155
Total elements: 14165940
Non-zero elements: 30275
Sparsity: 99.79%


### Exercise 2: Train a Simple Classifier with TfidfVectorizer

#### Goal: Use TfidfVectorizer to convert text into features and train a simple classifier.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Step 1: Initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

# Step 2: Convert reviews to TF-IDF features
X = vectorizer.fit_transform(df['review'])

# Step 3: Use the sentiment column as labels
y = df['sentiment']

# Step 4: Initialize SVM classifier with RBF kernel
# C=1.0 controls regularization (how strict the boundary is)
# kernel='rbf' allows for non-linear decision boundaries
model = SVC(kernel='rbf', C=1.0, random_state=42)

# Step 5: Train the SVM model
model.fit(X,y)

# Step 6: Predict on the same data (for demonstration)
predictions = model.predict(X)

# Step 7: Evaluate accuracy
accuracy = accuracy_score(y,predictions)
print("Training Accuracy:", accuracy)

# Step 8: Display model parameters
print(f"\nSVM Parameters:")
print(f"  - Kernel: {model.kernel}")
print(f"  - C (Regularization): {model.C}")
print(f"  - Number of support vectors: {model.n_support_}")

Training Accuracy: 0.9934497816593887

SVM Parameters:
  - Kernel: rbf
  - C (Regularization): 1.0
  - Number of support vectors: [1214 1193]


### Exercise 3: Train-Test Split and Evaluation

#### Goal: Split data into training and testing sets, and evaluate model performance.

In [10]:
from sklearn.model_selection import train_test_split

# Step 1: Split the data into training and testing sets (70%-30%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

# Step 2: Train SVM on the training set
model = SVC(kernel='rbf', C=1.0, random_state=42)
model.fit(X_train, y_train)

# Step 3: Predict on the test set
predictions = model.predict(X_test)

# Step 4: Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"\nTest Accuracy: {accuracy:.4f}")

# Step 5: Compare with training accuracy
train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Difference (overfitting check): {train_accuracy - accuracy:.4f}")

Training samples: 1923
Testing samples: 825

Test Accuracy: 0.8121
Training Accuracy: 0.9943
Difference (overfitting check): 0.1822


### Exercise 4: Confusion Matrix and Classification Report

#### Goal: Understand precision, recall, and F1-score using sklearn's evaluation tools.

In [11]:
from sklearn.metrics import confusion_matrix, classification_report

# Step 1: Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:\n", conf_matrix)

# Step 2: Generate a classification report
class_report = classification_report(y_test, predictions)
print("Classification Report:\n", class_report)

Confusion Matrix:
 [[349  86]
 [ 69 321]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.82       435
           1       0.79      0.82      0.81       390

    accuracy                           0.81       825
   macro avg       0.81      0.81      0.81       825
weighted avg       0.81      0.81      0.81       825



### Exercise 5: Hyperparameter Tuning with GridSearchCV

#### Goal: Use GridSearchCV to find the best hyperparameters for a model.

In [12]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Step 1: Define the parameter grid for SVM
param_grid = {
    'C': [0.1, 1.0, 10.0],           # Regularization strength
    'gamma': ['scale', 'auto'],       # Kernel coefficient
    'kernel': ['rbf', 'linear']       # Kernel type
}

print("Parameter grid to search:")
print(f"  C values: {param_grid['C']}")
print(f"  Gamma values: {param_grid['gamma']}")
print(f"  Kernels: {param_grid['kernel']}")
print(f"  Total combinations: {len(param_grid['C']) * len(param_grid['gamma']) * len(param_grid['kernel'])}")

# Step 2: Initialize GridSearchCV
# cv=5 means 5-fold cross-validation
grid_search = GridSearchCV(
    SVC(random_state=42), 
    param_grid, 
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1  # Use all CPU cores
)

# Step 3: Fit the model (this will take 1-2 minutes)
print("\nSearching for best parameters...")
grid_search.fit(X_train, y_train)

# Step 4: Display results
print("\n" + "="*50)
print("GRID SEARCH RESULTS")
print("="*50)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_:.4f}")

# Step 5: Evaluate on test set with best model
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print(f"Test Accuracy with best params: {test_accuracy:.4f}")

# Step 6: Show top 3 parameter combinations
print("\nTop 3 parameter combinations:")
results = grid_search.cv_results_
indices = np.argsort(results['mean_test_score'])[::-1][:3]
for i, idx in enumerate(indices, 1):
    print(f"{i}. Params: {results['params'][idx]}")
    print(f"   CV Score: {results['mean_test_score'][idx]:.4f}")

Parameter grid to search:
  C values: [0.1, 1.0, 10.0]
  Gamma values: ['scale', 'auto']
  Kernels: ['rbf', 'linear']
  Total combinations: 12

Searching for best parameters...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

GRID SEARCH RESULTS
Best Parameters: {'C': 10.0, 'gamma': 'scale', 'kernel': 'rbf'}
Best CV Score: 0.8315
Test Accuracy with best params: 0.8145

Top 3 parameter combinations:
1. Params: {'C': 10.0, 'gamma': 'scale', 'kernel': 'rbf'}
   CV Score: 0.8315
2. Params: {'C': 1.0, 'gamma': 'auto', 'kernel': 'linear'}
   CV Score: 0.8304
3. Params: {'C': 1.0, 'gamma': 'scale', 'kernel': 'linear'}
   CV Score: 0.8304


### Exercise 6: RANDOM FOREST

In [None]:
# ========================================
# Exercise 6: Train Random Forest Classifier
# ========================================
# Goal: Build and compare Random Forest with SVM

from sklearn.ensemble import RandomForestClassifier

# TODO: Step 1 - Initialize Random Forest
# Hint: Use RandomForestClassifier with these parameters:
#   - n_estimators=100 (number of trees)
#   - max_depth=None (let trees grow fully)
#   - random_state=42 (for reproducibility)
rf_model = ...  # YOUR CODE HERE

# TODO: Step 2 - Train the Random Forest model
# Hint: Use .fit() method with X_train and y_train
...  # YOUR CODE HERE

# TODO: Step 3 - Make predictions on test set
# Hint: Use .predict() method on X_test
rf_predictions = ...  # YOUR CODE HERE

# TODO: Step 4 - Calculate accuracy
# Hint: Use accuracy_score(y_test, rf_predictions)
rf_accuracy = ...  # YOUR CODE HERE

print(f"Random Forest Test Accuracy: {rf_accuracy:.4f}")

# TODO: Step 5 - Compare with SVM
print("\n" + "="*50)
print("MODEL COMPARISON")
print("="*50)
print(f"SVM Accuracy:          {test_accuracy:.4f}")
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"Winner: {'Random Forest' if rf_accuracy > test_accuracy else 'SVM'}")