In [None]:

'''

Note: 
* Apply only 1 vectorization method at a time; otherwise, make a copy of the X data for each implementation


Factors to be considered : 
1. Stemming vs Lemmatization => Lemmatization is better
2. Among stemming methods, Snowball Stemmer is better ( also known as Porter2 Stemmer )
3. Among lemmatization methods, WordNetLemmatizer is better
4. CountVectorizer vectorization vs TF-IDF method ( TF-IDF is better )
5. Splitting size of training & testing data => 0.20 - 0.30
6. Classifier => Logistic Regression vs Multinomial Naive Bayes

Tips:
1. Vary test size while splitting
2. For deciding classifier, use plotting the data


'''




In [None]:
''' Uploading the large dataset => Yt Link : https://www.youtube.com/watch?v=BuuH0wsJ8-k 

Syntax:
! gdown --id 12dX38oEANg_MJZF1zXACGbQPinYzR4GE
(the last argument is the file ID taken from the Google Drive share link)

'''

# Fetch the Google Drive file with this ID via the gdown CLI
# (presumably the Amazon_Unlocked_Mobile.csv read in the next cell — confirm).
! gdown --id 12dX38oEANg_MJZF1zXACGbQPinYzR4GE

In [None]:
import pandas as pd
import numpy as np
# Load the raw dataset; the relative path is resolved against the working directory.
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')

In [None]:
# Drop every row that has a missing value in any column
# (dropna works on rows by default, not on columns).
df1 = df.dropna()

# Keep only the first row of each (Product Name, Rating) pair. drop_duplicates
# returns a new frame, so df1 is left untouched.
# NOTE(review): this retains at most one review per product per rating value,
# and the per-product rating average computed later is taken over these
# surviving rows only — confirm that this is intended.
df2 = df1.drop_duplicates(subset=['Product Name', 'Rating'])

# Discard the columns that are not used further on.
df3 = df2.drop(columns=['Price', 'Review Votes'])


In [None]:
# Group by product name: average the ratings and collect the reviews.

# Procedure
# . Group by product name -> mean of the numeric columns (the rating)
# . Group by product name -> list of that product's review texts
# . Merge both results on product name

# as_index : https://stackoverflow.com/questions/41236370/what-is-as-index-in-groupby-in-pandas
# numeric_only=True restricts the mean to numeric columns. Since pandas 2.0 the
# non-numeric columns (e.g. the review text) are no longer dropped silently and
# a plain .mean() raises a TypeError instead.
df_rating = df3.groupby("Product Name", as_index=False).mean(numeric_only=True)

# One row per product holding the list of its reviews; "Product Name" is the index here.
df_review = df3.groupby("Product Name")["Reviews"].apply(list).to_frame()

# Inner join; the key is an index level on the left and a column on the right.
df_final = pd.merge(df_review, df_rating, on="Product Name", how='inner')


In [None]:
# Peek at the first rows of the Reviews column to see how the grouped lists look
temp = df_final['Reviews'].to_frame()
temp.head()

Unnamed: 0,Reviews
0,[I feel so LUCKY to have found this used (phon...
1,"[Phone is working on, I was planning to use it..."
2,"[all good, I love this phone; the Shine in gen..."
3,"[exelente, All around good phone, not glitchy ..."
4,[the sim card dose not read so what the point ...


In [None]:
# Binarize the averaged rating: 1 when the average is above 3, otherwise 0.
# The comparison is vectorized instead of calling a Python lambda per row.
# NOTE: 'Rating' is overwritten in place, so re-running this cell on its own
# would map every label to 0 — re-run the grouping/merge cell first.
df_final['Rating'] = df_final['Rating'].gt(3).astype('int64')

In [None]:
# Data cleaning: imports and NLTK resources

import re
import nltk

# Fetch only the corpora this notebook uses instead of nltk.download('all'),
# which pulls every NLTK dataset. 'omw-1.4' is requested as well because some
# NLTK releases need it for the WordNet lemmatizer. Add further resource names
# here if other cells require them.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

from nltk.corpus import stopwords
# 'not' is taken out of the stop-word list so negations survive the cleaning step.
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer


In [None]:

def data_cleaning( column ):
    """Normalize raw review text into space-joined, lemmatized tokens.

    Each entry of ``column`` is converted with ``str()``, every character that
    is not an ASCII letter is replaced by whitespace, the text is lower-cased
    and split into words, stop words are dropped, and the remaining words are
    lemmatized with WordNet.

    Parameters
    ----------
    column : iterable
        Entries to clean. They need not be strings (for example a list of
        reviews per product); each one is stringified first.

    Returns
    -------
    list of str
        One cleaned string per entry of ``column``, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    # Set membership is O(1); the module-level stop-word collection is a list.
    stop_words = set(all_stopwords)

    cleaned_data = []
    for review in column:
        # Remove digits and special characters, then tokenize on whitespace.
        words = re.sub('[^a-zA-Z]', '  ', str(review)).lower().split()

        # Stop-word removal ('a', 'in', 'i', ...) followed by lemmatization to
        # root words. Lemmatization was preferred over stemming here; to try
        # stemming, swap in e.g. SnowballStemmer('english').stem(word).
        lemmas = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

        cleaned_data.append(' '.join(lemmas))
    return cleaned_data

cleaned_text = data_cleaning( df_final['Reviews'] )


In [None]:
''' CountVectorizer vectorization method '''

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer limited to 98 features
cv = CountVectorizer(max_features=98)

# Fit on the cleaned reviews, then turn the resulting matrix into a dense array.
# Note that X is assigned again by the TF-IDF cell, so only the cell run last
# determines the features used downstream.
count_matrix = cv.fit_transform(cleaned_text)
X = count_matrix.toarray()

In [None]:
'''  TD_IDF vectorization method  '''

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings gathered in one place, then passed to the vectorizer
tfidf_options = dict(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True)
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the cleaned reviews; this replaces any X built by an earlier cell
X = vectorizer.fit_transform(cleaned_text)

In [None]:
# Dependent feature (target): the binarized rating. It is selected by column
# name so the labels do not depend on 'Rating' being the last column of df_final.
y = df_final['Rating'].to_numpy()
print( y )

[0 0 1 ... 0 1 1]


In [None]:
import pickle

# Persist both fitted vectorizers. The with-blocks close each file handle
# (flushing the pickle to disk) even if dumping raises; the bare open() calls
# used before never closed them.
with open( "countVectorizer.pkl", "wb" ) as cv_file:
    pickle.dump( cv, cv_file )
with open( "tfidfVectorizer.pkl", "wb" ) as tfidf_file:
    pickle.dump( vectorizer, tfidf_file )

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing; random_state=0 keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts
print( X_train.shape , X_test.shape )

(2756, 3948) (919, 3948)


In [None]:
# Logistic Regression Model Training

import os

import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

model = LogisticRegression()

model.fit( X_train , y_train )
y_pred = model.predict( X_test )

cm = confusion_matrix( y_test, y_pred )
print( cm )
print( accuracy_score( y_test, y_pred ) )


# Results recorded from earlier runs

# Using CountVectorizer
# [[701  89]
#  [172 141]]
# 0.7633726201269265

# Using TF-IDF
# [[628  23]
#  [165 103]]
# 0.795429815016322

# joblib.dump does not create missing directories, so make sure ./models
# exists before saving the fitted model.
os.makedirs( './models', exist_ok=True )
joblib.dump( model , './models/logisticRegression' )

[[628  23]
 [165 103]]
0.795429815016322


['./models/logisticRegression']

In [None]:
# MultinomialNB Model Training

import os

# Imported here as well so this cell does not rely on another training cell
# having been run first.
import joblib
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
modelmnb = MultinomialNB()

modelmnb.fit( X_train, y_train )
y_predmnb = modelmnb.predict( X_test )

cm = confusion_matrix( y_test, y_predmnb )
print( cm )
print( accuracy_score( y_test, y_predmnb ) )
# Result recorded from an earlier run
# [[649   2]
#  [253  15]]
# 0.7225244831338411

# joblib.dump does not create missing directories, so make sure ./models
# exists before saving the fitted model.
os.makedirs( './models', exist_ok=True )
joblib.dump( modelmnb , './models/multinomialNB' )

[[649   2]
 [253  15]]
0.7225244831338411


['./models/multinomialNB']

In [None]:
# SVM Model Training

import os

import joblib
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import LinearSVC

# NOTE: `model` and `y_pred` are rebound here, replacing the logistic
# regression objects created earlier under the same names.
model = LinearSVC()

model.fit( X_train , y_train )
y_pred = model.predict( X_test )

cm = confusion_matrix( y_test, y_pred )
print( cm )
print( accuracy_score( y_test, y_pred ) )


# Results recorded from earlier runs

# Using CountVectorizer
# [[587  64]
#  [159 109]]
# 0.7573449401523396

# Using TF-IDF
# [[578  73]
#  [130 138]]
# 0.779107725788901


# joblib.dump does not create missing directories, so make sure ./models
# exists before saving the fitted model. The file name is kept unchanged
# because the prediction cell loads it by this path.
os.makedirs( './models', exist_ok=True )
joblib.dump( model , './models/svmRegression' )

[[578  73]
 [130 138]]
0.779107725788901


['./models/svmRegression']

In [None]:


# Checking for user input of reviews

reviewinput = input()
cleanedinput = data_cleaning( [ reviewinput ] )

# Reload the fitted vectorizers. The with-blocks close the file handles that
# the bare open() calls used to leave open.
# NOTE: pickle.load can execute code embedded in the file — only load
# artifacts written by this notebook.
with open( 'countVectorizer.pkl', "rb" ) as cv_file:
    cv1 = pickle.load( cv_file )
X_fresh_1 = cv1.transform( cleanedinput ).toarray()


with open( 'tfidfVectorizer.pkl', "rb" ) as tfidf_file:
    cv2 = pickle.load( tfidf_file )
X_fresh_2 = cv2.transform( cleanedinput )


lgclassifier = joblib.load( './models/logisticRegression' )
mnbclassifier = joblib.load( './models/multinomialNB' )
svmclassifier = joblib.load( './models/svmRegression' )

# Using CountVectorizer features — left disabled; only meaningful when the
# saved models were trained on the CountVectorizer matrix.
# y_pred1 = lgclassifier.predict( X_fresh_1 )
# y_pred2 = mnbclassifier.predict( X_fresh_1 )

# print( y_pred1 )
# print( y_pred2 )


# Using TF-IDF features
y_pred3 = lgclassifier.predict( X_fresh_2 )
y_pred4 = mnbclassifier.predict( X_fresh_2 )
y_pred5 = svmclassifier.predict( X_fresh_2 )

print( y_pred3 )
print( y_pred4 )
print( y_pred5 )

good
[1]
[0]
[1]
