In [143]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix
import pickle
import seaborn as sns

In [144]:
df=pd.read_csv("reviews.csv")

In [145]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61594 entries, 0 to 61593
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Time_submitted  61594 non-null  object
 1   Review          61594 non-null  object
 2   Rating          61594 non-null  int64 
 3   Total_thumbsup  61594 non-null  int64 
 4   Reply           216 non-null    object
dtypes: int64(2), object(3)
memory usage: 2.3+ MB


In [146]:
df.isna().sum()

Time_submitted        0
Review                0
Rating                0
Total_thumbsup        0
Reply             61378
dtype: int64

In [147]:
df['Review'].value_counts()

Too many ads                                                                                                                                                                                                                36
Too much ads                                                                                                                                                                                                                15
Amazing music app                                                                                                                                                                                                            9
Very good music app                                                                                                                                                                                                          8
Great music selection                                                                                       

In [148]:
df['Rating'].value_counts()

5    22095
1    17653
4     7842
2     7118
3     6886
Name: Rating, dtype: int64

In [149]:
df.head()

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply
0,2022-07-09 15:00:00,"Great music service, the audio is high quality...",5,2,
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...,5,1,
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o...",4,0,
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1,1,
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...,1,1,


In [150]:
def rating(score):
    if score>=3:
        return "Good"
    else:
        return "Bad"

In [151]:
df['Rating']=df['Rating'].apply(rating)

In [152]:
df['Rating'].value_counts()

Good    36823
Bad     24771
Name: Rating, dtype: int64

In [153]:
x=df['Review']
y=df['Rating']

In [154]:
x

0        Great music service, the audio is high quality...
1        Please ignore previous negative rating. This a...
2        This pop-up "Get the best Spotify experience o...
3          Really buggy and terrible to use as of recently
4        Dear Spotify why do I get songs that I didn't ...
                               ...                        
61589    Even though it was communicated that lyrics fe...
61590    Use to be sooo good back when I had it, and wh...
61591    This app would be good if not for it taking ov...
61592    The app is good hard to navigate and won't jus...
61593    Its good but sometimes it doesnt load the musi...
Name: Review, Length: 61594, dtype: object

In [155]:
y

0        Good
1        Good
2        Good
3         Bad
4         Bad
         ... 
61589     Bad
61590     Bad
61591     Bad
61592     Bad
61593    Good
Name: Rating, Length: 61594, dtype: object

## Remove all special and numeric character from data and also remove stopwards and apply stemming

In [156]:
import nltk 

from nltk.tokenize import word_tokenize
nltk.download("punkt")


from  nltk.corpus import stopwords
nltk.download("stopwords")

from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rushi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rushi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rushi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [157]:
def clean_sent(text):
      #tokenization and case conversion
      token=word_tokenize(text.lower())

      #token--->list of tokens
      #removing non alpha char
      ftoken=[i for i in token if i.isalpha()]

      #ftoken-----> list
      sw=stopwords.words("english")
      stokens=[i for i in ftoken if i not in sw]

      #stokens--->list
      #lemmatization
      lemma=WordNetLemmatizer()
      ltoken=[lemma.lemmatize(i) for i in stokens]

      #ltoken--->list

      #joining all tokens
      return " ".join(ltoken)

In [158]:
df["clean_txt"]=df['Review'].apply(clean_sent)


In [159]:
df["clean_txt"]

0        great music service audio high quality app eas...
1        please ignore previous negative rating app sup...
2        get best spotify experience android annoying p...
3                       really buggy terrible use recently
4          dear spotify get song put playlist shuffle play
                               ...                        
61589    even though communicated lyric feature availab...
61590    use sooo good back downloaded free version cou...
61591    app would good taking device start comp spotif...
61592    app good hard navigate wo let play song click ...
61593    good sometimes doesnt load music play like sec...
Name: clean_txt, Length: 61594, dtype: object

In [160]:
x=df['clean_txt']
x

0        great music service audio high quality app eas...
1        please ignore previous negative rating app sup...
2        get best spotify experience android annoying p...
3                       really buggy terrible use recently
4          dear spotify get song put playlist shuffle play
                               ...                        
61589    even though communicated lyric feature availab...
61590    use sooo good back downloaded free version cou...
61591    app would good taking device start comp spotif...
61592    app good hard navigate wo let play song click ...
61593    good sometimes doesnt load music play like sec...
Name: clean_txt, Length: 61594, dtype: object

In [161]:
y=df['Rating']
y

0        Good
1        Good
2        Good
3         Bad
4         Bad
         ... 
61589     Bad
61590     Bad
61591     Bad
61592     Bad
61593    Good
Name: Rating, Length: 61594, dtype: object

In [162]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features=5000)
X = cv.fit_transform(x)

In [163]:
X.shape

(61594, 5000)

In [164]:
xtrain,xtest,ytrain,ytest= train_test_split(X , y , test_size=0.25, random_state=101)

In [165]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(xtrain,ytrain)

In [166]:
ypred = mnb.predict(xtest)

In [167]:
print(accuracy_score(ytest , ypred))
print(classification_report(ytest , ypred))

0.8121955971166959
              precision    recall  f1-score   support

         Bad       0.76      0.77      0.77      6160
        Good       0.85      0.84      0.84      9239

    accuracy                           0.81     15399
   macro avg       0.80      0.81      0.80     15399
weighted avg       0.81      0.81      0.81     15399



In [168]:
pd.DataFrame(np.c_[ytest ,ypred] , columns=["Actual" , "Predicted"]).head(30)

Unnamed: 0,Actual,Predicted
0,Bad,Bad
1,Good,Good
2,Good,Good
3,Bad,Bad
4,Good,Good
5,Good,Good
6,Bad,Bad
7,Bad,Good
8,Good,Good
9,Good,Good


In [174]:
# Save  trained naive-bayes model and TfidfVectorizer

In [170]:
pickle.dump(cv , open("tfidfVectorizer.pkl" , "wb"))
pickle.dump(mnb , open("spotify_review_classification.pkl" , "wb"))

In [171]:
save_cv = pickle.load(open('tfidfVectorizer.pkl','rb'))

##define my function to the model##,'rb'))
model = pickle.load(open('spotify_review_classification.pkl','rb'))

##define function to the model


In [172]:
def test_model(sentence):
    sen = save_cv.transform([sentence]).toarray()
    res = model.predict(sen)[0]
    if res == "Good":
        return 'Positive review'
    else:
         return 'Negative review'

In [173]:
##Test review and check that what does model predict and it predicted correct

In [109]:
sen="great music service audio high quality app "
res = test_model(sen)
print(res)

Positive review


In [110]:
sen="good sometimes doesnt load music"
res=test_model(sen)
print(res)

Positive review


In [111]:
sen="Still extremely slow when changing storage" 
res=test_model(sen)
print(res)

Negative review


In [112]:
sen="love spotify usually app best others stated "
res=test_model(sen)
print(res)

Positive review
