 # Product review - NLP
 
In this project I will use data from yelp, amazon and imdb for the product review

In [3]:
import numpy as np
import pandas as pd
import nltk.corpus
import string
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Reading data

In [4]:
df0=pd.read_csv('yelp.txt',delimiter='\t',names =['text','review'])

In [5]:
df1=pd.read_csv('amazon.txt',delimiter='\t',names =['text','review'])

In [6]:
df2=pd.read_csv('imdb.txt',delimiter='\t',names =['text','review'])

In [7]:
df=pd.concat([df1,df2,df0],axis=0)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2748 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2748 non-null   object
 1   review  2748 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 64.4+ KB


In [9]:
df.head(4)

Unnamed: 0,text,review
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0


## Data Preprocessing

### Function to remove Punctuations and stopwords

In [10]:
def text_process(mess):
    """
    1. Remove punctuations.
    2. remove stop words
    3. return clean string """
    nopunc=[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    nopunc=[word.lower() for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    nopunc=' '.join(nopunc)
    return nopunc

### Lemmatizer function

In [11]:
from nltk.stem import WordNetLemmatizer
def lema(text):
    word=''
    wordnet_lemmatizer = WordNetLemmatizer()
    tok = nltk.word_tokenize(text)
    for w in tok:
        word=word+wordnet_lemmatizer.lemmatize(w)+' '
    return word.rstrip()

### Applying the Functions

In [12]:
df['text']=df['text'].apply(text_process)

In [13]:
df['text']=df['text'].apply(lema)

In [14]:
df

Unnamed: 0,text,review
0,way plug u unless go converter,0
1,good case excellent value,1
2,great jawbone,1
3,tied charger conversation lasting 45 minutesma...,0
4,mic great,1
...,...,...
995,think food flavor texture lacking,0
996,appetite instantly gone,0
997,overall impressed would go back,0
998,whole experience underwhelming think well go n...,0


## Train Test Split

In [15]:
from sklearn.model_selection import train_test_split
msg_train, msg_test, label_train, label_test = train_test_split(df['text'], df['review'], test_size=0.2)

## Creating a Pipeline

Creating a pipeline here which will automate the process for me. It will pass the count vector to TfidfTransformer and then the model and the model will be trained on that data

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [17]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [18]:
pipeline.fit(msg_train,label_train)

Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

## Evaluating the model

In [19]:
pred=pipeline.predict(msg_test)

In [20]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(label_test,pred))
print('\n')
print(classification_report(label_test,pred))

[[218  72]
 [ 37 223]]


              precision    recall  f1-score   support

           0       0.85      0.75      0.80       290
           1       0.76      0.86      0.80       260

    accuracy                           0.80       550
   macro avg       0.81      0.80      0.80       550
weighted avg       0.81      0.80      0.80       550

