In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import warnings
import nltk
import time
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
import joblib 

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas chained-assignment warnings
# and scikit-learn deprecation notices are hidden as well; consider narrowing it.
warnings.filterwarnings('ignore')

print("All libraries imported successfully.")

All libraries imported successfully.


In [2]:
# Fetch the NLTK stop-word corpus used by the stop-word removal step below.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srava\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load the training data from the working directory; later cells read its
# 'Phrase' and 'Sentiment' columns.
d = pd.read_csv('train.csv')

In [5]:
# Keep only the text and label columns. The explicit .copy() makes new_df an
# independent frame, so the column assignments performed on it in later cells
# are unambiguous writes to new_df rather than writes to a selection of `d`
# (the pattern pandas reports with SettingWithCopyWarning).
new_df = d[['Phrase', 'Sentiment']].copy()

In [6]:
# Function to clean text
def clean_text(text):
    """Normalise a phrase before stop-word removal and vectorisation.

    Lower-cases the text, removes digits and punctuation, then collapses every
    run of whitespace to a single space and trims both ends.

    Whitespace is normalised last on purpose: deleting digits or punctuation
    can itself leave adjacent spaces behind (e.g. "good - bad" -> "good  bad"),
    so collapsing spaces any earlier does not actually remove the extras.

    Args:
        text: The raw phrase (a ``str``).

    Returns:
        The cleaned phrase as a ``str``.
    """
    text = text.lower()                  # convert to lowercase
    text = re.sub(r'\d+', '', text)      # remove digits
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation (word chars, incl. "_", are kept)
    text = re.sub(r'\s+', ' ', text)     # collapse whitespace runs left by the steps above
    return text.strip()

In [7]:
# Run every phrase through clean_text, one element at a time
new_df['Phrase'] = new_df['Phrase'].map(clean_text)

In [8]:
# English stop words from NLTK, held in a set for fast membership tests
# in remove_stopwords.
stopset = set(stopwords.words('english'))

In [9]:
def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words removed.

    The text is split on whitespace, tokens found in the stop-word collection
    are dropped, and the remaining tokens are re-joined with single spaces.
    Matching is exact and case-sensitive.

    Args:
        text: The phrase to filter (a ``str``).
        stop_words: Optional collection of words to drop. When omitted, the
            notebook-level ``stopset`` is used, which keeps existing
            single-argument calls (e.g. ``Series.apply(remove_stopwords)``)
            working unchanged.

    Returns:
        The filtered phrase; an empty string if every token was a stop word.
    """
    if stop_words is None:
        # Looked up at call time so the function follows the current stopset.
        stop_words = stopset
    return ' '.join(word for word in text.split() if word not in stop_words)

In [10]:
# Drop stop words from every phrase, one element at a time
new_df['Phrase'] = new_df['Phrase'].map(remove_stopwords)

In [11]:
# NOTE(review): this instance is never fitted; the next cell builds an identical
# TfidfVectorizer under the same name and fits that one instead.
vectorizer = TfidfVectorizer(max_features=5000)

In [12]:
from sklearn.svm import SVC, LinearSVC

# 1. Separate the features (cleaned text phrases) and the target (sentiment labels)
X_text = new_df['Phrase']
y = new_df['Sentiment']

# 2. Fit the TF-IDF vectorizer; max_features caps the vocabulary at 5000 terms.
#    NOTE(review): the vectorizer is fitted on every row, including rows that
#    later cells use for evaluation.
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X_text)

# 3. Save the fitted vectorizer so the same vocabulary can be reused later
print("Saving fitted vectorizer...")
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
print("Vectorizer saved as 'vectorizer.pkl'")

# 4. Apply SMOTE to the numerical TF-IDF matrix to balance the classes
print("Applying SMOTE...")
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_tfidf, y)

# 5. Confirm the new balanced class distribution
print("Class distribution after SMOTE:")
print(y_smote.value_counts())

# 6. Train the model on the balanced (SMOTE'd) data.
#    The hyper-parameters and seed are spelled out so this model matches the
#    LinearSVC configuration used in the later comparison and final-model cells,
#    and so any shuffling done by the solver is repeatable.
print("Training sentiment model...")
model = LinearSVC(C=1.0, max_iter=1000, random_state=42)

model.fit(X_smote, y_smote)
print("Model training complete.")

# 7. Save the trained model to a file.
#    The final-model cell at the end of the notebook writes to this same path.
print("Saving trained model...")
with open('sentiment_model.pkl', 'wb') as f:
    pickle.dump(model, f)
print("Model saved as 'sentiment_model.pkl'")

Saving fitted vectorizer...
Vectorizer saved as 'vectorizer.pkl'
Applying SMOTE...


  File "C:\Users\srava\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\srava\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 546, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\srava\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1022, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\srava\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1491, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Class distribution after SMOTE:
Sentiment
1    79582
2    79582
3    79582
4    79582
0    79582
Name: count, dtype: int64
Training sentiment model...
Model training complete.
Saving trained model...
Model saved as 'sentiment_model.pkl'


In [None]:
# Hold out the test rows from the ORIGINAL TF-IDF matrix first, then oversample
# only the training portion. Splitting the already-oversampled X_smote would
# place SMOTE's synthetic points (interpolated from rows that also land in the
# training set) into the test set, which can inflate the reported scores.
# stratify=y keeps the original class proportions in both parts.
x_train_raw, x_test, y_train_raw, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)
x_train, y_train = SMOTE(random_state=42).fit_resample(x_train_raw, y_train_raw)

In [None]:
# Naive Bayes baseline: fit on the training split and time the fit
classifier1 = MultinomialNB()
start_train_time = time.time()
classifier1.fit(X=x_train, y=y_train)
end_train_time = time.time()
train_time = end_train_time - start_train_time

# Time inference on the test split
start_pred_time = time.time()
y_pred = classifier1.predict(X=x_test)
end_pred_time = time.time()
pred_time = end_pred_time - start_pred_time

# Score the predictions against the true labels
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

# One print call, one line per argument: same output as three separate prints
print(
    "\n--- Model Timing Summary ---",
    f"Total Training Time:   {train_time:.2f} seconds",
    f"Total Prediction Time: {pred_time:.2f} seconds",
    sep="\n",
)

In [None]:
# Training the Logistic regression classifier.
# `multi_class` is intentionally not passed: the parameter is deprecated in
# recent scikit-learn releases, and with the lbfgs solver on a multi-class
# target the default already selects the multinomial formulation, so the
# fitted model is the same.
classifier6 = LogisticRegression(solver='lbfgs', max_iter=1000, n_jobs=-1)
start_train_time = time.time()
classifier6.fit(x_train, y_train)
end_train_time = time.time()
train_time = end_train_time - start_train_time

# Making predictions on the test set (timed separately from training)
start_pred_time = time.time()
y_pred = classifier6.predict(x_test)
end_pred_time = time.time()
pred_time = end_pred_time - start_pred_time

# Evaluating the model: overall accuracy plus the per-class report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)
print("\n--- Model Timing Summary ---")
print(f"Total Training Time:   {train_time:.2f} seconds")
print(f"Total Prediction Time: {pred_time:.2f} seconds")

In [None]:
from sklearn.svm import LinearSVC
import time

# Linear SVM with explicit hyper-parameters and a fixed seed
classifier = LinearSVC(C=1.0, max_iter=1000, random_state=42)

# Fit on the training split and report the elapsed time
print("Starting LinearSVC training...")
start_train_time = time.time()
classifier.fit(X=x_train, y=y_train)
end_train_time = time.time()
train_time = end_train_time - start_train_time
print(f"Training finished in: {train_time:.2f} seconds")

# Predict the test split with the model fitted in this cell
start_pred_time = time.time()
y_pred = classifier.predict(X=x_test)
end_pred_time = time.time()
pred_time = end_pred_time - start_pred_time

# Score the predictions against the true labels
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)
# One print call, one line per argument: same output as three separate prints
print(
    "\n--- Model Timing Summary ---",
    f"Total Training Time:   {train_time:.2f} seconds",
    f"Total Prediction Time: {pred_time:.2f} seconds",
    sep="\n",
)

In [None]:
from sklearn.linear_model import SGDClassifier
import time

# Linear model trained by stochastic gradient descent on the hinge loss
classifier = SGDClassifier(loss='hinge', n_jobs=-1, random_state=42)

# Fit on the training split and report the elapsed time
print("Starting SGDClassifier training...")
start_train_time = time.time()
classifier.fit(X=x_train, y=y_train)
end_train_time = time.time()
train_time = end_train_time - start_train_time
print(f"Training finished in: {train_time:.2f} seconds")

# Predict the test split with the model fitted in this cell
start_pred_time = time.time()
y_pred = classifier.predict(X=x_test)
end_pred_time = time.time()
pred_time = end_pred_time - start_pred_time

# Score the predictions against the true labels
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)
# One print call, one line per argument: same output as three separate prints
print(
    "\n--- Model Timing Summary ---",
    f"Total Training Time:   {train_time:.2f} seconds",
    f"Total Prediction Time: {pred_time:.2f} seconds",
    sep="\n",
)

In [None]:
from lightgbm import LGBMClassifier
import time

# Gradient-boosted trees: 200 estimators, all cores, fixed seed
classifier = LGBMClassifier(n_estimators=200, n_jobs=-1, random_state=42)

# Fit on the training split and report the elapsed time
print("Starting LightGBM training...")
start_train_time = time.time()
classifier.fit(X=x_train, y=y_train)
end_train_time = time.time()
train_time = end_train_time - start_train_time
print(f"Training finished in: {train_time:.2f} seconds")

# Predict the test split with the model fitted in this cell
start_pred_time = time.time()
y_pred = classifier.predict(X=x_test)
end_pred_time = time.time()
pred_time = end_pred_time - start_pred_time

# Score the predictions against the true labels
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)
# One print call, one line per argument: same output as three separate prints
print(
    "\n--- Model Timing Summary ---",
    f"Total Training Time:   {train_time:.2f} seconds",
    f"Total Prediction Time: {pred_time:.2f} seconds",
    sep="\n",
)

In [None]:
from sklearn.svm import LinearSVC
import pickle

# 1. Initialize the chosen model (LinearSVC).
#    The original author's note reports it as the most accurate (68.86%) and
#    fastest of the models compared above; re-check that figure whenever the
#    evaluation split or preprocessing changes.
final_model = LinearSVC(C=1.0, max_iter=1000, random_state=42)

# 2. Train it on ALL the balanced data (X_smote / y_smote, not just the
#    training split). The sample count is read from the matrix so the message
#    stays correct if the dataset changes.
print(f"Training the final model on all {X_smote.shape[0]:,} samples...")
start_time = time.time()
final_model.fit(X_smote, y_smote)
end_time = time.time()
print(f"Final model trained in {end_time - start_time:.2f} seconds.")

# 3. Save the trained model to a file. The context manager closes the handle
#    (and therefore flushes the pickle to disk) even if dumping raises.
#    This writes to the same path as the model saved in the earlier training cell.
with open('sentiment_model.pkl', 'wb') as f:
    pickle.dump(final_model, f)

print("Final model saved successfully as 'sentiment_model.pkl'")