In [49]:
import pandas as pd
data=pd.read_csv("Dataset-SA.csv")
data=data.drop(columns=["Rate","product_name","product_price"])
len(data)


205052

In [50]:
data['Review'] = data['Review'] + '.' + data['Summary']
data=data.drop(columns=["Summary"])
data.head()

Unnamed: 0,Review,Sentiment
0,super!.great cooler excellent air flow and for...,positive
1,awesome.best budget 2 fit cooler nice cooling,positive
2,fair.the quality is good but the power of air ...,positive
3,useless product.very bad product its a only a fan,negative
4,fair.ok ok product,neutral


In [51]:

sample_sizes = {'positive': 4000, 'negative': 4000, 'neutral': 4000}
balanced_sample = pd.DataFrame()

for sentiment, size in sample_sizes.items():
    sampled_data = data[data['Sentiment'] == sentiment].sample(n=size, random_state=42)
    balanced_sample = pd.concat([balanced_sample, sampled_data])

balanced_sample = balanced_sample.sample(frac=1, random_state=42).reset_index(drop=True)



data=balanced_sample.dropna()
print(data['Sentiment'].value_counts())


Sentiment
positive    3532
neutral     3452
negative    3447
Name: count, dtype: int64


3452

3319 3393 3288


In [52]:
import re 
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
ps = PorterStemmer()
corpus = []
for i in data['Review']:
    rp = re.sub('[^a-zA-Z]'," ",i)
    rp = rp.lower()
    rp = rp.split()
    rp = [ps.stem(word) for word in rp if not word in set(stopwords.words('english'))]
    
    
    rp = " ".join(rp)
    corpus.append(rp)
len(corpus)

10431

In [53]:
corpus

['super great cooler excel air flow price amaz unbelievablejust love',
 'awesom best budget fit cooler nice cool',
 'fair qualiti good power air decent',
 'useless product bad product fan',
 'fair ok ok product',
 'awesom cooler realli fantast provid good air flow highli recommend',
 'highli recommend good product',
 'nice nice',
 'unsatisfactori bad cooler',
 'worth money good',
 'great product beauti product good materi perfectli work',
 'mind blow purchas awesom',
 'highli recommend good',
 'brilliant wonder product must buy',
 'classi product nice air cooler smart cool breez produc',
 'must buy awsm',
 'fabul nice product',
 'worth everi penni great cooler',
 'super nice product',
 'great product good',
 'awesom nice product',
 'worth everi penni good product',
 'wow nice product reason price',
 'awesom like',
 'terrif purchas goodd',
 'excel good product',
 'worth everi penni good product kawaleti',
 'terrif good cooler amaz beauti design afford price',
 'awesom use sinc month gre

In [53]:
from sklearn.feature_extraction.text import CountVectorizer 
cv = CountVectorizer() 
X = cv.fit_transform(corpus).toarray()
len(X)

10431

In [54]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(data['Sentiment'])
y

array([2, 0, 1, ..., 0, 2, 0])

In [55]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state = 42)

In [56]:
from sklearn.naive_bayes import MultinomialNB
rf_model = MultinomialNB()
rf_model.fit(X_train,y_train)

In [57]:
y_pred=rf_model.predict(X_test)
y_pred


array([2, 0, 2, ..., 0, 1, 2])

In [58]:
y_test

array([2, 0, 2, ..., 0, 0, 1])

In [59]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(rf_model,X,y,cv=5))

[0.78246287 0.77181208 0.76845638 0.76653883 0.76845638]


In [76]:
negations = ["not", "n't", "never", "no", "none", "nothing"]

def preprocess_review(review):
    text=[]
    for i in review:
        words = i.split()
        new_words = []
        negate = False
        
        for word in words:
            if word.lower() in negations:
                negate = True
                new_words.append("NOT_") 
            elif negate:
                
                new_words.append("NOT_" + word)
                negate = False
            else:
                new_words.append(word)
    
        
        processed_review = " ".join(new_words)
        
        
        processed_review = re.sub('[^a-zA-Z]', " ", processed_review)
        processed_review = processed_review.lower()
        
       
        processed_review = processed_review.split()
        processed_review = [ps.stem(word) if not word.startswith("not_") else word for word in processed_review]
        text.append(" ".join(processed_review))
    return text
    


test_review = [ "Most comfort And light weight,👍👍👍👍👍"]
processed_review = preprocess_review(test_review)
print(processed_review)
test_vector = cv.transform(processed_review).toarray()
predicted_label = rf_model.predict(test_vector)
print(predicted_label)
predicted_sentiment = le.inverse_transform(predicted_label)
print(f"Predicted Sentiment: {predicted_sentiment}")

['most comfort and light weight']
[1]
Predicted Sentiment: ['neutral']


In [72]:
import pickle
with open('rf_model_final.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

with open('count_vectorizer_final.pkl', 'wb') as cv_file:
    pickle.dump(cv, cv_file)