In [38]:
#!pip install nltk



# Import the required libraries
import numpy as np
import pandas as pd

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re

# Pre-processing library
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.model_selection import train_test_split

# Models 
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline



# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# df = pd.read_csv("/kaggle/input/amazon-alexa-reviews/amazon_alexa.tsv",sep="\t")
# Read the tab-separated review export that sits next to this notebook.
df = pd.read_csv('./amazon_alexa.tsv', delimiter='\t')
# Report (rows, columns) before showing the first few records.
print(df.shape)
df.head()

(3150, 5)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [39]:
# Drop unrelated columns, keeping only the review text and its label.
# `errors='ignore'` keeps this cell re-runnable: `df` is rebound here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['date', 'variation', 'rating'], errors='ignore')
# Remove rows whose review text duplicates another row.
df = df.drop_duplicates(subset='verified_reviews')
print(df.describe(include='all'))
df.info()


       verified_reviews     feedback
count              2301  2301.000000
unique             2301          NaN
top       Love my Echo!          NaN
freq                  1          NaN
mean                NaN     0.910474
std                 NaN     0.285564
min                 NaN     0.000000
25%                 NaN     1.000000
50%                 NaN     1.000000
75%                 NaN     1.000000
max                 NaN     1.000000
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2301 entries, 0 to 2800
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   verified_reviews  2301 non-null   object
 1   feedback          2301 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 53.9+ KB


In [60]:
class ml_model():
    """Review classifier: text cleaning followed by a vectorizer + estimator pipeline.

    Parameters
    ----------
    vectorizer : str, default 'tfidf'
        Name of the text vectorizer. Supported: 'tfidf'.
    model : str, default 'svm'
        Name of the estimator. Supported: 'svm', 'random_forest',
        'logistic_regression'.

    Raises
    ------
    ValueError
        If ``vectorizer`` or ``model`` is not one of the supported names.

    Attributes
    ----------
    vectorizer : the vectorizer instance used as the first pipeline step.
    estimator  : the estimator instance used as the second pipeline step.
    pipeline   : the sklearn ``Pipeline`` chaining ``vectorizer`` and ``estimator``.
    model      : the estimator *name* passed to the constructor (a string).
    stopwords  : English stop words with 'not' and 'nor' removed.
    lemmatizer : a single shared ``WordNetLemmatizer``.
    """

    # Seed for estimators that are randomised, so repeated runs agree.
    _SEED = 42

    # Name -> factory. Factories are lambdas so that every instance gets its
    # own fresh, unfitted object and nothing is built until it is asked for.
    _VECTORIZERS = {
        'tfidf': lambda: TfidfVectorizer(),
    }
    _MODELS = {
        'svm': lambda: svm.SVC(),
        'random_forest': lambda: RandomForestClassifier(random_state=ml_model._SEED),
        'logistic_regression': lambda: LogisticRegression(),
    }

    # A single pass that is equivalent to replacing every non-alphanumeric
    # character with a space and then every run of digits with a space.
    _NOISE = re.compile(r"[^A-Za-z0-9]|[0-9]+")

    def __init__(self, vectorizer='tfidf', model='svm'):
        # Validate both names before building anything, so a typo fails here
        # with a clear message instead of as a missing attribute further down.
        if vectorizer not in self._VECTORIZERS:
            raise ValueError(
                f"Unsupported vectorizer {vectorizer!r}; "
                f"expected one of {sorted(self._VECTORIZERS)}"
            )
        if model not in self._MODELS:
            raise ValueError(
                f"Unsupported model {model!r}; "
                f"expected one of {sorted(self._MODELS)}"
            )

        self.vectorizer = self._VECTORIZERS[vectorizer]()
        self.estimator = self._MODELS[model]()
        self.pipeline = Pipeline([('vectorizer', self.vectorizer), ('model', self.estimator)])
        # `model` has always ended up holding the estimator's name; that is
        # kept so existing readers of the attribute are unaffected. The
        # estimator object itself is available as `self.estimator`.
        self.model = model

        # Negations carry sentiment, so they must survive stop-word removal.
        # Set difference (rather than `remove`) does not fail if a word is absent.
        self.stopwords = set(stopwords.words('english')) - {'not', 'nor'}

        # Built once here instead of once per token in `clean_review`.
        self.lemmatizer = WordNetLemmatizer()

    def load_data(self, df):
        """Store a private copy of ``df`` on the instance as ``self.df``."""
        self.df = df.copy()

    def clean_review(self, review):
        """Normalise one review into a space-joined string of lemmatized tokens.

        Steps: tokenize, lower-case, replace non-alphanumeric characters and
        digit runs with spaces, drop stop words, lemmatize, join with spaces.

        Parameters
        ----------
        review : str
            Raw review text. Missing values (``None``/NaN) are treated as an
            empty review; other non-string values are converted with ``str``.

        Returns
        -------
        str
            The cleaned review.
        """
        if not isinstance(review, str):
            review = '' if pd.isna(review) else str(review)

        lowered = (token.lower() for token in word_tokenize(review))
        scrubbed = (self._NOISE.sub(' ', token) for token in lowered)
        lemmatize = self.lemmatizer.lemmatize
        # The stop-word test runs on the scrubbed token, as it always has.
        return " ".join(lemmatize(token) for token in scrubbed if token not in self.stopwords)

    def preprocess(self, x, y=None):
        """Clean every review in ``x`` and pass ``y`` through unchanged.

        Parameters
        ----------
        x : pandas.Series
            Raw review texts; each element is run through ``clean_review``.
        y : optional
            Labels; returned as given.

        Returns
        -------
        tuple
            ``(cleaned_x, y)``. Both are also stored on the instance as
            ``self.data`` and ``self.result``.
        """
        self.data = x.apply(self.clean_review)
        self.result = y

        return self.data, self.result

    def train(self, x, y):
        """Clean ``x``, fit the pipeline on ``(x, y)`` and print the fitted pipeline."""
        x, y = self.preprocess(x, y)
        print(self.pipeline.fit(x, y))

    def score(self, x, y):
        """Clean ``x`` and return the pipeline's ``score`` on ``(x, y)``."""
        x, _ = self.preprocess(x)
        return self.pipeline.score(x, y)
        

In [61]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df['verified_reviews'],
    df['feedback'],
    test_size=0.2,
    random_state=42,
)

In [62]:
# Build the TF-IDF + SVM classifier and fit it on the training split.
mymodel = ml_model(vectorizer='tfidf', model='svm')
mymodel.train(x_train, y_train)


Pipeline(steps=[('vectorizer', TfidfVectorizer()), ('model', SVC())])


In [63]:
# Score the fitted pipeline on the held-out reviews and print the result.
test_score = mymodel.score(x_test, y_test)
print(test_score)

0.9175704989154013
