<a href="https://colab.research.google.com/github/SujayNaik17/sentiment-classification/blob/main/Notebooks/Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Mount Drive and Load Dataset

In [1]:
# Import libraries
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Set path to the cleaned CSV
file_path = '/content/drive/MyDrive/Sentiment Analysis/Web Scrapper/cleaned_data.csv'

# Load the cleaned dataset
df = pd.read_csv(file_path)

# Clean column names (strip spaces just in case)
df.columns = df.columns.str.strip()

# Preview the dataset
print("Cleaned dataset loaded!")
display(df.head())


Mounted at /content/drive
Cleaned dataset loaded!


Unnamed: 0,Product Name,Product ID,Review Text,Review Rating,Reviewer Verified,Sentiment Label
0,MILTON Flip lid 1000 Thermosteel Water Bottle,itm50665b7f93ce2,best class,5,Yes,positive
1,MILTON Flip lid 1000 Thermosteel Water Bottle,itm50665b7f93ce2,nice,5,Yes,positive
2,MILTON Flip lid 1000 Thermosteel Water Bottle,itm50665b7f93ce2,good quality,5,Yes,positive
3,MILTON Flip lid 1000 Thermosteel Water Bottle,itm50665b7f93ce2,weast,1,Yes,negative
4,MILTON Flip lid 1000 Thermosteel Water Bottle,itm50665b7f93ce2,super,5,Yes,positive


##Vectorizing the Text Data

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Remove rows where 'Review Text' is missing or NaN
df = df.dropna(subset=['Review Text'])

X = df['Review Text']
y = df['Sentiment Label']

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

print(" TF-IDF Vectorization complete.")
print("TF-IDF Matrix shape:", X_tfidf.shape)


 TF-IDF Vectorization complete.
TF-IDF Matrix shape: (8796, 4930)


## Training

In [3]:
from sklearn.model_selection import train_test_split

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

print("Train-test split done.")
print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


Train-test split done.
Training samples: 7036
Testing samples: 1760


## SVM

In [15]:
from sklearn.svm import SVC
import joblib

# Train SVC with probability estimates enabled
svc_model = SVC(kernel='linear', probability=True, random_state=42)
svc_model.fit(X_train, y_train)

print("SVC model trained successfully.")


svc_model_path = '/content/drive/MyDrive/Sentiment Analysis/Web Scrapper/svc_model.pkl'
joblib.dump(svc_model, svc_model_path)

print(f"SVC model saved at: {svc_model_path}")

SVC model trained successfully.
SVC model saved at: /content/drive/MyDrive/Sentiment Analysis/Web Scrapper/svc_model.pkl


### Accuracy

In [7]:
from sklearn.metrics import accuracy_score, classification_report

y_pred_svm = svc_model.predict(X_test)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.4f}")

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_svm))


SVM Accuracy: 0.8386

Classification Report:

              precision    recall  f1-score   support

    negative       0.84      0.83      0.83       579
     neutral       0.57      0.11      0.18       148
    positive       0.84      0.95      0.89      1033

    accuracy                           0.84      1760
   macro avg       0.75      0.63      0.64      1760
weighted avg       0.82      0.84      0.81      1760



### Testing

In [13]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt_tab')


stop_words = set(stopwords.words('english'))

# Preprocessing function (same as training)
def clean_text(text):
    text = text.lower()
    text = text.encode('ascii', 'ignore').decode('ascii')  # remove emojis and non-English
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)      # remove digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove anything else
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

def predict_sentiment(text):
    cleaned = clean_text(text)
    vector = vectorizer.transform([cleaned])
    prediction = svc_model.predict(vector)[0]

    # Get predicted probability
    proba = svc_model.predict_proba(vector)[0]
    class_index = svc_model.classes_.tolist().index(prediction)
    confidence = proba[class_index]

    print("Input:", text)
    print("Predicted Sentiment:", prediction)
    print("Confidence Score:", round(confidence, 4))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [14]:
predict_sentiment("Absolutely loved the product.")
predict_sentiment("Worst purchase ever. Very low quality.")
predict_sentiment("It's fine. Does the job.")


Input: Absolutely loved the product.
Predicted Sentiment: positive
Confidence Score: 0.8696
Input: Worst purchase ever. Very low quality.
Predicted Sentiment: negative
Confidence Score: 0.978
Input: It's fine. Does the job.
Predicted Sentiment: positive
Confidence Score: 0.9061
