In [1]:
import os
import string

import joblib
import nltk
import pandas as pd
import scipy.sparse
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Location of the feedback CSV. Set the FEEDBACK_CSV environment variable to
# point at your own copy of the dataset; the fallback is the machine-specific
# path this notebook was originally run with.
DATA_PATH = os.environ.get(
    "FEEDBACK_CSV",
    r"C:\Users\priya\OneDrive\Desktop\customer_feedback_sentiment_dataset 2.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# NLTK resources needed by the text cleaner: tokenizer models, the English
# stop-word list and the WordNet data used for lemmatisation.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on current
# NLTK releases; 'punkt' is still fetched so older installations keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Set membership makes the per-token stop-word test a constant-time lookup.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\priya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\priya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\priya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Translation table that deletes ASCII punctuation; built once rather than on
# every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise one review into a space-joined string of lemmatised tokens.

    Steps: lower-case, strip ASCII punctuation, tokenise with NLTK, drop
    English stop words and non-alphabetic tokens, lemmatise what is left.

    Args:
        text: Raw review. Non-string values are converted with ``str()``.

    Returns:
        The cleaned text, or ``''`` when ``text`` is missing
        (``None``, ``NaN`` or ``pd.NA``).
    """
    # A missing review would otherwise become str(nan) == "nan", which passes
    # every filter below and ends up as a spurious "nan" token.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''
    lowered = str(text).lower().translate(_PUNCT_TABLE)
    tokens = nltk.word_tokenize(lowered)
    kept = [lemmatizer.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words]
    return ' '.join(kept)

In [5]:
# Run every raw review through clean_text and keep the result in its own
# column, next to the untouched original text.
df['Cleaned_Text'] = df['Review_Text'].map(clean_text)

In [6]:
def rating_to_sentiment(rating):
    """Map a numeric rating to a sentiment label.

    Args:
        rating: Numeric rating. Values of 2 or less are 'Negative', exactly 3
            is 'Neutral', anything else is 'Positive'.

    Returns:
        One of 'Negative', 'Neutral' or 'Positive'.

    Raises:
        ValueError: If ``rating`` is missing (``None``, ``NaN`` or ``pd.NA``).
    """
    # NaN fails both comparisons below, so without this guard a missing rating
    # would silently fall through and be labelled 'Positive'.
    if pd.isna(rating):
        raise ValueError(
            "rating is missing; drop or impute rows without a rating before labelling"
        )
    if rating <= 2:
        return 'Negative'
    if rating == 3:
        return 'Neutral'
    return 'Positive'

In [7]:

# Derive the sentiment label for each row from its rating.
df['Sentiment_Label'] = df['Rating'].map(rating_to_sentiment)

In [8]:
# Integer code for each sentiment name: Negative -> 0, Neutral -> 1, Positive -> 2.
label_map = {
    name: code
    for code, name in enumerate(('Negative', 'Neutral', 'Positive'))
}
# Target vector: the label column translated to its integer codes.
y = df['Sentiment_Label'].map(label_map)

In [9]:










# TF-IDF features over the cleaned reviews, capped at 3000 vocabulary terms.
# NOTE(review): the vectorizer is fitted on every row of df, including rows
# that are later held out for testing.
tfidf = TfidfVectorizer(max_features=3000)
X_text = tfidf.fit_transform(raw_documents=df['Cleaned_Text'])

In [10]:
smote = SMOTE(random_state=42)
X_bal, y_bal = smote.fit_resample(X_text, y)

In [11]:

X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, test_size=0.2, random_state=42)

In [12]:
# Random forest with 150 trees; the fixed seed makes training repeatable.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=150,
)
model.fit(X=X_train, y=y_train)

In [13]:

# Score the classifier on the held-out partition.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1, labelled with the sentiment names.
report = classification_report(y_test, y_pred, target_names=label_map.keys())
print("\nClassification Report:\n", report)

Accuracy: 0.7560975609756098

Classification Report:
               precision    recall  f1-score   support

    Negative       0.77      0.80      0.79        51
     Neutral       0.84      0.77      0.80        56
    Positive       0.67      0.70      0.68        57

    accuracy                           0.76       164
   macro avg       0.76      0.76      0.76       164
weighted avg       0.76      0.76      0.76       164



In [14]:

# Persist the fitted classifier and the fitted vectorizer; both are needed to
# score new text the way predict_sentiment does.
joblib.dump(value=model, filename="sentiment_model.pkl")
joblib.dump(value=tfidf, filename="tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [15]:
def predict_sentiment(text):
    """Return the predicted sentiment name for one raw piece of text.

    The text is cleaned with ``clean_text``, vectorised with the fitted
    ``tfidf`` vectorizer and classified by ``model``; the predicted integer
    code is then translated back to its name through ``label_map``.
    """
    features = tfidf.transform([clean_text(text)])
    code = model.predict(features)[0]
    # Reverse lookup: integer code -> sentiment name.
    code_to_name = dict(zip(label_map.values(), label_map.keys()))
    return code_to_name[code]