# import libraries

In [1]:
# !pip install --upgrade scikit-learn

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [3]:
data=pd.read_csv("full_review.csv")

In [4]:
data.head()

Unnamed: 0,index,review
0,0,It was nice produt. I like it's design a lot. ...
1,1,awesome sound....very pretty to see this nd th...
2,2,awesome sound quality. pros 7-8 hrs of battery...
3,3,I think it is such a good product not only as ...
4,4,awesome bass sound quality very good bettary l...


# Set the index

In [5]:
data.set_index("index",inplace=True)

# null values

In [6]:
data.isnull().sum()

review    845
dtype: int64

In [7]:
data.dropna(inplace=True)

In [8]:
data.duplicated().sum()

13400

In [9]:
data.drop_duplicates(inplace=True)

In [10]:
data.shape

(52613, 1)

### Add a new col that can hold the text that the review is positive or negative

In [11]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [12]:
# Make a object of sentiment class
sia=SentimentIntensityAnalyzer()

In [13]:
# Apply the Sentiment obj
data["sentiment"]=data["review"].apply(lambda x : sia.polarity_scores(x)['compound'])

# now our data look like this

In [14]:
data.head()

Unnamed: 0_level_0,review,sentiment
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,It was nice produt. I like it's design a lot. ...,0.802
1,awesome sound....very pretty to see this nd th...,0.9501
2,awesome sound quality. pros 7-8 hrs of battery...,0.931
3,I think it is such a good product not only as ...,0.9851
4,awesome bass sound quality very good bettary l...,0.9053


# Now convert the sentimnt

In [15]:
def sentiment(nbr):
    if(nbr>0):
        return "positive"
    elif(nbr<0):
        return "negative"
    else:
        return "neutral"

# Apply the fun

In [16]:
data["sentiment"]=data["sentiment"].apply(sentiment)

In [17]:
data.head(10)

Unnamed: 0_level_0,review,sentiment
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,It was nice produt. I like it's design a lot. ...,positive
1,awesome sound....very pretty to see this nd th...,positive
2,awesome sound quality. pros 7-8 hrs of battery...,positive
3,I think it is such a good product not only as ...,positive
4,awesome bass sound quality very good bettary l...,positive
5,Awsome sound powerful bass battery backup is a...,positive
6,This product sound is clear and excellent bass...,positive
7,Should u buy this---Pros:-1. Sound quality and...,positive
8,"First of all, I want to talk about sound quali...",positive
9,Good looking Super Fine clear Sound and power ...,positive


In [18]:
data["sentiment"].value_counts()

sentiment
positive    37286
negative     9869
neutral      5458
Name: count, dtype: int64

# Now perform text analysis

In [19]:
# Convert lower case
# Remove pouncation
# remove stop words
# Stem the word

# Lower case

In [20]:
def to_lower(text):
    return text.lower()

In [21]:
to_lower("sSAMi")

'ssami'

# Remove pouncation

In [96]:
import string as s

def remove_poun(text):
    new = ""
    for char in text:
        if char.isalnum():
            new += char
        else:
            new += ' '
    return new


In [97]:
n="sami!$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ullah"
remove_poun(n)

'sami                              ullah'

# Remove Stopword

In [24]:
from nltk.corpus import stopwords
def remove_stopword(text):
    stopword=set(stopwords.words("english"))
    
    new_text=[word for word in text.split() if word not in stopword]
    
    return " ".join(new_text)

In [25]:
remove_stopword("i am a a biy a snf i ama a good bot")

'biy snf ama good bot'

# Stem the word

In [26]:
from nltk.stem import PorterStemmer

In [27]:
port =PorterStemmer()

In [28]:
port.stem("I am a boy")

'i am a boy'

In [29]:
def stem(text):
    return port.stem(text)

In [30]:
stem("I am a boy")

'i am a boy'

# Tokenize the word

In [31]:
from nltk.tokenize import word_tokenize

In [32]:
def tokenize(text):
    return word_tokenize(text)

In [33]:
tokenize("samiullah is a good boy")

['samiullah', 'is', 'a', 'good', 'boy']

# Now all the fun in a single fun

In [34]:
# Now combne all the fun in a single fun
def text_process(text):
    # remove poun fun
    text=remove_poun(text)
    
    # To lower fun
    text=to_lower(text)

  
    # remove stop word fun
    text=remove_stopword(text)
    
    # Stem Fun
    text=stem(text)
    
    # Tokenize fun
    final=tokenize(text)
    
    return " ".join(final)

In [35]:
text_process("i am a good boy and you")

'good boy'

In [36]:
# data["review"]=data["review"].apply(text_process)

# Encode the label

In [37]:
from sklearn.preprocessing import LabelEncoder

In [38]:
encode=LabelEncoder()
data["sentiment"]=encode.fit_transform(data["sentiment"])

# Split the text

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
feature=data["review"]
label=data['sentiment']

In [41]:
feature_reshaped = np.array(feature).reshape(-1, 1)
feature_reshaped

array([["It was nice produt. I like it's design a lot.  It's easy to carry. And.   Looked stylish.READ MORE"],
       ['awesome sound....very pretty to see this nd the sound quality was too good I wish to take this product loved this product üòçüòçüòçREAD MORE'],
       ['awesome sound quality. pros 7-8 hrs of battery life (including 45 mins approx call time)Awesome sound output. Bass and treble are really very clear without equaliser. With equaliser, sound wary depends on the handset sound quality.Weightless to carry and in head tooMic is good, but in traffic it is not too good (3.25/5)3.5mm Option is really important to mention. Really expecting other leading brands to implement this.ConsVery tight in ears. adjusters are ok .. this ll be very tight...READ MORE'],
       ...,
       ['To remove hunger is enough'],
       ["It's good, but lately it has become very expensive."],
       ['they took good care of me']], dtype=object)

In [42]:
label.head()

index
0    2
1    2
2    2
3    2
4    2
Name: sentiment, dtype: int32

In [43]:
data["sentiment"].value_counts()

sentiment
2    37286
0     9869
1     5458
Name: count, dtype: int64

# Aboove we see that the data is totally imbalance now we can balance them

# Build a Pipeline

In [44]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline
from collections import Counter

In [45]:
# Create a dictionary for RandomUnderSampler
under_sampling_strategy = {class_label: int(0.5 * count) for class_label, count in Counter(label).items()}

In [46]:
under_sampling_strategy

{2: 18643, 0: 4934, 1: 2729}

In [47]:
resampling_pipeline = Pipeline([
    ('under', RandomUnderSampler(sampling_strategy=under_sampling_strategy)),  # Downsampling majority class
    ('over', RandomOverSampler(sampling_strategy='auto')),  # Upsampling minority classes
])

In [48]:
x_resampled, y_resampled = resampling_pipeline.fit_resample(feature_reshaped, label)

In [49]:
x_train,x_test,y_train,y_test=train_test_split(x_resampled,y_resampled,test_size=0.2,random_state=42)

In [50]:
x_train.shape

(44743, 1)

In [51]:
x_train.shape

(44743, 1)

In [52]:
x_test.shape

(11186, 1)

In [53]:
y_test.shape

(11186,)

In [54]:
y_train[y_train==2].value_counts()

sentiment
2    14959
Name: count, dtype: int64

In [55]:
y_train[y_train==1].value_counts()

sentiment
1    14886
Name: count, dtype: int64

In [56]:
y_train[y_train==0].value_counts()

sentiment
0    14898
Name: count, dtype: int64

In [57]:
y_test.shape

(11186,)

# Now the data is balance now we perform next step

In [58]:
# from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer

In [59]:
model=MultinomialNB()

# Multinomianl Naivebase

In [60]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(preprocessor=text_process,max_features=1000)),
    ('classifier', MultinomialNB())
])

# logistic Regression

In [61]:
pipe2 = Pipeline([
    ('vectorizer', CountVectorizer(preprocessor=text_process,max_features=1000)),
    ('classifier', LogisticRegression())
])

# SVM

In [62]:
pipe3 = Pipeline([
    ('vectorizer', CountVectorizer(preprocessor=text_process,max_features=1000)),
    ('classifier', SVC())
])

In [63]:
pipeline.fit(x_train,y_train)

In [64]:
pipe2.fit(x_train, y_train)

In [65]:
pipe3.fit(x_train, y_train)

In [66]:
# # naivebase
pre=pipeline.predict(x_test)

# # Logistic Regression
pred=pipe2.predict(x_test)

# # SVC
svc_pred=pipe3.predict(x_test)

# Accuracy

In [67]:
from sklearn.metrics import accuracy_score,r2_score

In [68]:
# Naivebase
accuracy=accuracy_score(y_test,pre)

# Logistic Regression
acc=accuracy_score(y_test,pred)

# SVC
svc_acc=accuracy_score(y_test,svc_pred)

In [69]:
print("Nib base Accuracy: ",accuracy)
print("Logistic Accuracy: ",acc)
print("svc Accuracy: ",svc_acc)

Nib base Accuracy:  0.6827284105131415
Logistic Accuracy:  0.8509744323261219
svc Accuracy:  0.8342571070981584


In [70]:
from sklearn.model_selection import cross_val_score

In [71]:
cv=cross_val_score(pipeline,x_train,y_train,cv=10)

In [72]:
cv.mean()

0.6794579282408852

In [73]:
# cv2=cross_val_score(pipe2,x_train,y_train,cv=10)
# cv3=cross_val_score(pipe3,x_train,y_train,cv=10)

In [74]:
# print(cv2.mean(),cv3.mean())

# Pickle the model

In [75]:
import pickle as pkl
with open("Sentiment_analysis.pkl", "wb") as f:
    pkl.dump(pipe2, f)

In [76]:
# import joblib

# # Save object
# joblib.dump(pipe3, "Sentiment_analysis.pkl")

# Load the model

In [77]:
with open("Sentiment_analysis.pkl","rb") as f:
    m=pkl.load(f)

# test on some text

In [98]:
s="you are a very slow The service was slow, the food was bland, and the overall atmosphere was disappointing."
m.predict([s])

array([1])

In [114]:
new="As the relentless storm clouds gathered overhead, a palpable sense of foreboding enveloped the once serene landscape. The biting winds carried with them the stench of impending doom, and the heavens unleashed torrents of rain, washing away any semblance of tranquility. Each raindrop seemed to echo the mournful dirge of shattered dreams. The world, once vibrant with promise, now appeared draped in a shroud of desolation. Every step forward felt like an arduous journey through a murky swamp of despair, the ground sinking beneath the weight of unmet expectations. The skeletal remains of wilted flowers mirrored the decay of optimism, and the air was thick with the acrid taste of bitter disappointment. In this disheartening tableau, the once bright horizon now loomed ominously, casting a shadow that seemed to swallow the very essence of hope."

In [115]:
m.predict([new])

array([0])

In [116]:
n="The sun dipped below the horizon, casting long shadows across the quiet town. A gentle breeze rustled through the leaves, and the distant hum of crickets filled the evening air. The scent of blooming flowers mingled with the earthy aroma of damp soil, creating a serene ambiance. Streetlights flickered to life, casting a warm glow on the cobblestone streets. In this tranquil moment, time seemed to slow, and the world embraced a peaceful stillness."

In [117]:
m.predict([n])

array([1])

In [118]:
d="In the heart of the metropolis, neon lights painted the night in a kaleidoscope of colors. The vibrant nightlife unfolded, with laughter and music echoing through the streets. Each corner held a story, and the city's pulse quickened as it embraced the diversity of its nocturnal inhabitants."

In [119]:
m.predict([d])

array([1])

In [134]:
s="The movie was a complete letdown. The plot was confusing, the characters were poorly developed, and I left the theater feeling thoroughly disappointed."

In [135]:
m.predict([s])

array([1])

In [None]:
# text="The relentless rain outside mirrors the storm within my heart. A warm breeze, laughter in the air, and the sun shining bright ‚Äì today is a day filled with happiness. The clock ticks steadily as I sit by the window, lost in thought, contemplating the passing moments."