In [1]:
# Import Libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
import joblib

In [2]:
# Step 1: Load Preprocessed Dataset
# The CSV path is relative to the notebook's working directory.
file_path = "preprocessed_dataset.csv"
data = pd.read_csv(file_path)

# Sanity check: announce the load and preview the leading rows in one call
# (sep="\n" puts the preview on its own line, as two separate prints would).
print("Dataset Loaded Successfully!", data.head(), sep="\n")


Dataset Loaded Successfully!
             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                            text_  
0  love well made sturdi comfort love veri pretti  
1        love great upgrad origin mine coupl year  
2      thi pillow save back love look feel pillow  
3             miss inform use great product price  
4        veri nice set good qualiti set two month  


In [3]:
# Step 2: Encode Target Labels
# Fit the encoder on the string labels and replace the column with integer codes.
# NOTE: the 'label' column is overwritten in place, so re-running this cell
# without reloading the dataset would refit the encoder on the integer codes.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(data['label'])
data['label'] = encoded_labels

# Show which integer code each original class name was assigned.
print("Label Encoding Mapping:")
class_names = label_encoder.classes_
class_codes = label_encoder.transform(class_names)
print({name: code for name, code in zip(class_names, class_codes)})


Label Encoding Mapping:
{'CG': 0, 'OR': 1}


In [4]:
# Step 3: Feature Engineering with TF-IDF
# Vectorizer hyperparameters, gathered in one mapping so they are easy to tune.
tfidf_settings = {
    "max_features": 10000,  # cap on vocabulary size
    "ngram_range": (1, 2),  # unigrams and bigrams
    "max_df": 0.9,          # drop terms present in more than 90% of documents
    "min_df": 2,            # drop terms present in fewer than 2 documents
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Target vector, then the sparse TF-IDF feature matrix built from the text column.
# NOTE(review): the vectorizer is fit on every row, i.e. before the train/test
# split performed in a later step — consider fitting on the training rows only.
y = data['label']
X = vectorizer.fit_transform(data['text_'])

# Confirm the (n_documents, n_features) dimensions.
feature_shape = X.shape
print("Feature Matrix Shape:", feature_shape)


Feature Matrix Shape: (40431, 10000)


In [5]:
# Step 4: Address Class Imbalance with SMOTE
# Seed the sampler so the synthetic minority samples -- and therefore the
# resampled matrix, the subsequent split and the reported metrics -- are
# reproducible on Restart & Run All. The seed matches the one used for the
# train/test split and the SVC elsewhere in this notebook.
# NOTE(review): resampling is applied to the full feature matrix before the
# train/test split in the next step, so synthetic rows can land in the test
# set; consider splitting first and resampling only the training fold.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [6]:
# Step 5: Split the Dataset
# Hold out 20% of the resampled rows for testing; the seed keeps the split repeatable.
split_parts = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows ended up in each partition.
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print("Training set size:", n_train_rows)
print("Testing set size:", n_test_rows)


Training set size: 32345
Testing set size: 8087


In [7]:
# Step 6: Train SVM Model
# Linear-kernel support vector classifier with regularisation parameter C=1.0.
# NOTE(review): per the scikit-learn docs, SVC's random_state only matters when
# probability estimates are enabled; it is kept here unchanged.
svm_params = dict(kernel='linear', C=1.0, random_state=42)
svm_model = SVC(**svm_params)

# Fit on the training partition, with progress messages around the call.
print("Training the SVM model...")
svm_model.fit(X_train, y_train)
print("Model Training Completed!")


Training the SVM model...
Model Training Completed!


In [8]:
# Step 7: Evaluate the Model
# Predict on the held-out partition, compute each metric once, then print them.
y_pred = svm_model.predict(X_test)

eval_report = classification_report(y_test, y_pred)
eval_accuracy = accuracy_score(y_test, y_pred)
eval_confusion = confusion_matrix(y_test, y_pred)

print("\nClassification Report:\n", eval_report)
print(f"Accuracy: {eval_accuracy:.2f}")
# confusion_matrix rows are true labels, columns are predicted labels.
print("Confusion Matrix:\n", eval_confusion)



Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.90      0.90      4021
           1       0.90      0.91      0.91      4066

    accuracy                           0.90      8087
   macro avg       0.90      0.90      0.90      8087
weighted avg       0.90      0.90      0.90      8087

Accuracy: 0.90
Confusion Matrix:
 [[3613  408]
 [ 367 3699]]


In [9]:
# Step 8: Save the Model, Label Encoder and Vectorizer
# Persist every fitted object needed to score new text outside this notebook.
joblib.dump(svm_model, "svm_model.pkl")
# The encoder holds the mapping between the original class names and the
# integer codes the model was trained on; without it, a consumer of the saved
# model cannot turn predictions back into the original label strings.
joblib.dump(label_encoder, "label_encoder.pkl")
# Kept last so the cell's displayed value is still the vectorizer's file list.
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']