# import libraries

In [1]:
# !pip install --upgrade scikit-learn

In [2]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [3]:
data=pd.read_csv("full_review.csv")

In [4]:
data.head()

Unnamed: 0,index,review
0,0,It was nice produt. I like it's design a lot. ...
1,1,awesome sound....very pretty to see this nd th...
2,2,awesome sound quality. pros 7-8 hrs of battery...
3,3,I think it is such a good product not only as ...
4,4,awesome bass sound quality very good bettary l...


# Set the index

In [5]:
data.set_index("index",inplace=True)

# null values

In [6]:
data.isnull().sum()

review    845
dtype: int64

In [7]:
data.dropna(inplace=True)

In [8]:
data.duplicated().sum()

13400

In [9]:
data.drop_duplicates(inplace=True)

In [10]:
data.shape

(52613, 1)

### Add a new col that can hold the text that the review is positive or negative

In [11]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [12]:
# Make a object of sentiment class
sia=SentimentIntensityAnalyzer()

In [13]:
# Apply the Sentiment obj
data["sentiment"]=data["review"].apply(lambda x : sia.polarity_scores(x)['compound'])

# now our data look like this

In [14]:
data.head()

Unnamed: 0_level_0,review,sentiment
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,It was nice produt. I like it's design a lot. ...,0.802
1,awesome sound....very pretty to see this nd th...,0.9501
2,awesome sound quality. pros 7-8 hrs of battery...,0.931
3,I think it is such a good product not only as ...,0.9851
4,awesome bass sound quality very good bettary l...,0.9053


# Now convert the sentimnt

In [15]:
def sentiment(nbr):
    if(nbr>0):
        return "positive"
    elif(nbr<0):
        return "negative"
    else:
        return "neutral"

# Apply the fun

In [16]:
data["sentiment"]=data["sentiment"].apply(sentiment)

In [17]:
data.head(10)

Unnamed: 0_level_0,review,sentiment
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,It was nice produt. I like it's design a lot. ...,positive
1,awesome sound....very pretty to see this nd th...,positive
2,awesome sound quality. pros 7-8 hrs of battery...,positive
3,I think it is such a good product not only as ...,positive
4,awesome bass sound quality very good bettary l...,positive
5,Awsome sound powerful bass battery backup is a...,positive
6,This product sound is clear and excellent bass...,positive
7,Should u buy this---Pros:-1. Sound quality and...,positive
8,"First of all, I want to talk about sound quali...",positive
9,Good looking Super Fine clear Sound and power ...,positive


In [18]:
data["sentiment"].value_counts()

sentiment
positive    37286
negative     9869
neutral      5458
Name: count, dtype: int64

# Now perform text analysis

In [19]:
# Convert lower case
# Remove pouncation
# remove stop words
# Stem the word

# Lower case

In [20]:
def to_lower(text):
    return text.lower()

In [21]:
to_lower("sSAMi")

'ssami'

# Remove pouncation

In [22]:
import string as s
def remove_poun(text):
    new=""
    for i in text:
        if(i.isalnum()):
            new+=i
        else:
            new+=" "
    return new

In [23]:
n="sami!$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ullah"
remove_poun(n)

'sami                              ullah'

# Remove Stopword

In [24]:
from nltk.corpus import stopwords
def remove_stopword(text):
    stopword=set(stopwords.words("english"))
    
    new_text=[word for word in text.split() if word not in stopword]
    
    return " ".join(new_text)

In [25]:
remove_stopword("i am a a biy a snf i ama a good bot")

'biy snf ama good bot'

# Stem the word

In [26]:
from nltk.stem import PorterStemmer

In [27]:
port =PorterStemmer()

In [28]:
port.stem("I am a boy")

'i am a boy'

In [29]:
def stem(text):
    return port.stem(text)

In [30]:
stem("I am a boy")

'i am a boy'

# Tokenize the word

In [31]:
from nltk.tokenize import word_tokenize

In [32]:
def tokenize(text):
    return word_tokenize(text)

In [33]:
tokenize("samiullah is a good boy")

['samiullah', 'is', 'a', 'good', 'boy']

# Now all the fun in a single fun

In [34]:
# Now combne all the fun in a single fun
def text_process(text):
    # To lower fun
    text=to_lower(text)

    # remove poun fun
    text=remove_poun(text)
  
    # remove stop word fun
    text=remove_stopword(text)
    
    # Stem Fun
    text=stem(text)
    
    # Tokenize fun
    final=tokenize(text)
    
    return " ".join(final)

In [35]:
text_process("i am a good boy and you")

'good boy'

In [36]:
# data["review"]=data["review"].apply(text_process)

# Encode the label

In [37]:
from sklearn.preprocessing import LabelEncoder

In [38]:
encode=LabelEncoder()
data["sentiment"]=encode.fit_transform(data["sentiment"])

# Split the text

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
feature=data["review"]
label=data['sentiment']

In [41]:
label.head()

index
0    2
1    2
2    2
3    2
4    2
Name: sentiment, dtype: int32

In [42]:
data["sentiment"].value_counts()

sentiment
2    37286
0     9869
1     5458
Name: count, dtype: int64

In [43]:
x_train,x_test,y_train,y_test=train_test_split(feature,label,test_size=0.2,random_state=42)

In [44]:
x_train.shape

(42090,)

In [45]:
y_train.shape

(42090,)

# Build a Pipeline

In [46]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
model=MultinomialNB()

# Multinomianl Naivebase

In [48]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(preprocessor=text_process,max_features=1000)),
    ('classifier', MultinomialNB())
])

# logistic Regression

In [49]:
pipe2 = Pipeline([
    ('vectorizer', CountVectorizer(preprocessor=text_process,max_features=1000)),
    ('classifier', LogisticRegression())
])

# SVM

In [50]:
pipe3 = Pipeline([
    ('vectorizer', CountVectorizer(preprocessor=text_process,max_features=1000)),
    ('classifier', SVC())
])

# Random Forest

In [51]:
pipe_random = Pipeline([
    ('vectorizer', CountVectorizer(preprocessor=text_process,max_features=1000)),
    ('classifier', RandomForestClassifier())
])

In [52]:
pipeline.fit(x_train,y_train)

In [53]:
pipe2.fit(x_train, y_train)

In [54]:
pipe3.fit(x_train, y_train)

In [55]:
# pipe_random.fit(x_train, y_train)

In [56]:
# naivebase
pre=pipeline.predict(x_test)

# Logistic Regression
pred=pipe2.predict(x_test)

# SVC
svc_pred=pipe3.predict(x_test)

# Accuracy

In [57]:
from sklearn.metrics import accuracy_score,r2_score

In [58]:
# Naivebase
accuracy=accuracy_score(y_test,pre)

# Logistic Regression
acc=accuracy_score(y_test,pred)

# SVC
svc_acc=accuracy_score(y_test,svc_pred)

In [59]:
print("Nib base Accuracy: ",accuracy)
print("Logistic Accuracy: ",acc)
print("svc Accuracy: ",svc_acc)

Nib base Accuracy:  0.7469352846146536
Logistic Accuracy:  0.8646773733726124
svc Accuracy:  0.847762045044189


In [60]:
from sklearn.model_selection import cross_val_score

In [61]:
# cv=cross_val_score(pipeline,x_train,y_train,cv=10)

In [62]:
# cv.mean()

# Pickle the model

In [67]:
import pickle as pkl
with open("Sentiment_analysis.pkl", "wb") as f:
    pkl.dump(pipe2, f)

In [64]:
# import joblib

# # Save object
# joblib.dump(pipe3, "Sentiment_analysis.pkl")

# Load the model

In [68]:
with open("Sentiment_analysis.pkl","rb") as f:
    m=pkl.load(f)

# test on some text

In [69]:
s="you are a very slow The service was slow, the food was bland, and the overall atmosphere was disappointing."
m.predict([s])

array([0])

In [70]:
new="I absolutely loved the new movie! The storyline was captivating, the acting was superb, and the cinematography was breathtaking."

In [71]:
m.predict([new])

array([2])

In [72]:
n="The restaurant experience was terrible. The service was slow, the food was bland, and the overall atmosphere was disappointing."

In [73]:
m.predict([n])

array([0])

In [74]:
d="The seminar covered various topics related to the industry and industry area are very dirty. The speakers presented their findings, and attendees had the opportunity to ask questions during the Q&A session."

In [75]:
m.predict([d])

array([0])

In [None]:
s="you are a very slow The service was slow, the food was bland, and the overall atmosphere was disappointing."

In [76]:
m.predict([s])

array([0])

In [None]:
# text="The relentless rain outside mirrors the storm within my heart. A warm breeze, laughter in the air, and the sun shining bright – today is a day filled with happiness. The clock ticks steadily as I sit by the window, lost in thought, contemplating the passing moments."

In [None]:
# def full_process(text):
#     processed_sentences = []
#     for sentence in text.split("."):
#         processed_sentence = text_process(sentence)
#         processed_sentences.append(processed_sentence)
#     return " ".join(processed_sentences)

In [None]:
# full_process(text)