In [1]:
%pip install pandas nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 541.6 kB/s eta 0:00:02
   ------------- -------------------------- 0.5/1.5 MB 541.6 kB/s eta 0:00:02
   -------------------- ------------------- 0.8/1.5 MB 516.5 kB/s eta 0:00:02
   -------------------- ------------------- 0.8/1.5 MB 516.5 kB/s eta 0:00:02
   -------------------- ------------------- 0.8/1.5 MB 516.5 kB/s eta 0:00:02
   --------------------------- ------------ 1.0/1.5 MB 445.6 kB/s eta 0:00:02
   --------------------------- ------------ 1.0/1.5 MB 445.6 kB/s eta 0:00:02
   --------------------------- ------------ 1.0/1.5 M

In [None]:
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load your dataset
df = pd.read_csv("../dataset/data_amazon.xlsx - Sheet1.csv")  # Replace with your actual filename

# Keep only necessary columns
df = df[['Review', 'Cons_rating']]

# Drop missing values
df.dropna(subset=['Review', 'Cons_rating'], inplace=True)

# Function to clean the review text
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"\d+", "", text)  # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    tokens = word_tokenize(text)  # tokenize
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  # remove stopwords
    return ' '.join(tokens)

# Apply cleaning
df['cleaned_review'] = df['Review'].apply(clean_text)

# Generate sentiment labels
def label_sentiment(rating):
    if rating <= 2:
        return 'Negative'
    elif rating == 3:
        return 'Neutral'
    else:
        return 'Positive'

df['sentiment'] = df['Cons_rating'].apply(label_sentiment)

# Final cleaned DataFrame
cleaned_df = df[['cleaned_review', 'sentiment']]

# Save cleaned data if needed
cleaned_df.to_csv("cleaned_fashion_reviews.csv", index=False)

# Display few rows
print(cleaned_df.head())


[nltk_data] Downloading package punkt to C:\Users\FAST
[nltk_data]     LAPTOP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\FAST
[nltk_data]     LAPTOP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


                                      cleaned_review sentiment
0        absolutely wonderful silky sexy comfortable  Positive
1  love dress sooo pretty happened find store im ...  Positive
2  high hopes dress really wanted work initially ...   Neutral
3  love love love jumpsuit fun flirty fabulous ev...  Positive
4  shirt flattering due adjustable front tie perf...  Positive


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib

df=cleaned_df

# Features and Labels
X = df['cleaned_review']
y = df['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define 5 pipelines with model and TF-IDF
pipelines = {
    'LogisticRegression': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('clf', LogisticRegression(max_iter=1000))
    ]),
    'NaiveBayes': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('clf', MultinomialNB())
    ]),
    'SVM': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('clf', LinearSVC())
    ]),
    'RandomForest': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('clf', RandomForestClassifier())
    ]),
    'KNN': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('clf', KNeighborsClassifier())
    ]),
}

# Define hyperparameters to tune for each model
param_grid = {
    'LogisticRegression': {
        'clf__C': [0.1, 1, 10]
    },
    'NaiveBayes': {
        'clf__alpha': [0.5, 1.0, 1.5]
    },
    'SVM': {
        'clf__C': [0.1, 1, 10]
    },
    'RandomForest': {
        'clf__n_estimators': [100, 200],
        'clf__max_depth': [None, 10]
    },
    'KNN': {
        'clf__n_neighbors': [3, 5, 7]
    }
}

# GridSearch over all models
best_score = 0
best_model = None
best_model_name = ""

for name, pipe in pipelines.items():
    print(f"Training {name}...")
    grid = GridSearchCV(pipe, param_grid[name], cv=3, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)
    acc = accuracy_score(y_test, grid.best_estimator_.predict(X_test))
    print(f"{name} Accuracy: {acc:.4f}")

    if acc > best_score:
        best_score = acc
        best_model = grid.best_estimator_
        best_model_name = name

# Save the best model
joblib.dump(best_model, f"{best_model_name}_sentiment_model.pkl")
print(f"\n✅ Best model: {best_model_name} (Accuracy: {best_score:.4f})")
print(f"📦 Model saved as: {best_model_name}_sentiment_model.pkl")


Training LogisticRegression...
LogisticRegression Accuracy: 0.8168
Training NaiveBayes...
NaiveBayes Accuracy: 0.7928
Training SVM...
SVM Accuracy: 0.8144
Training RandomForest...
RandomForest Accuracy: 0.7989
Training KNN...
KNN Accuracy: 0.7487

✅ Best model: LogisticRegression (Accuracy: 0.8168)
📦 Model saved as: LogisticRegression_sentiment_model.pkl
