In [11]:
import pandas as pd
import numpy as np
import nltk
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [12]:
import os

# Location of the TripAdvisor reviews CSV. Set the TRIPADVISOR_REVIEWS_CSV
# environment variable to run this notebook on another machine; the fallback
# is the original author's local download path.
DATA_PATH = os.environ.get(
    "TRIPADVISOR_REVIEWS_CSV",
    "C:\\Users\\risha\\Downloads\\archive (2)\\tripadvisor_hotel_reviews.csv",
)
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

In [13]:
# Preview the first five rows to confirm the file parsed into the expected columns.
df.head(5)

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [15]:
# (rows, columns) of the loaded frame.
df.shape

(20491, 2)

In [16]:
# Count missing values per column.
df.isnull().sum()

Review    0
Rating    0
dtype: int64

In [17]:
# Class balance: number of reviews for each star rating.
df["Rating"].value_counts()

5    9054
4    6039
3    2184
2    1793
1    1421
Name: Rating, dtype: int64

In [18]:
# Look for reviews that are exactly the empty string; whitespace-only reviews
# would not be matched by this equality test.
df.loc[df["Review"] == ""]

Unnamed: 0,Review,Rating


In [19]:
# NOTE(review): this only builds a GroupBy object and displays its repr; no
# aggregation is applied, so the cell reveals nothing about the data.
# Presumably a duplicate-review check was intended — confirm or remove.
df.groupby('Review')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002B168A07100>

In [20]:
pos = [5, 4, 3]
neg = [1, 2]

def sentiment(rating):
    """Map a star rating to a binary sentiment label.

    Ratings listed in ``pos`` are labelled "POSITIVE"; every other value
    (whether or not it appears in ``neg``) is labelled "NEGATIVE".
    """
    label = "POSITIVE" if rating in pos else "NEGATIVE"
    return label
# Derive the binary target column from the star rating, one label per row.
df['Sentiment'] = df['Rating'].map(sentiment)
df.head()

Unnamed: 0,Review,Rating,Sentiment
0,nice hotel expensive parking got good deal sta...,4,POSITIVE
1,ok nothing special charge diamond member hilto...,2,NEGATIVE
2,nice rooms not 4* experience hotel monaco seat...,3,POSITIVE
3,"unique, great stay, wonderful time hotel monac...",5,POSITIVE
4,"great stay great stay, went seahawk game aweso...",5,POSITIVE


In [21]:
from nltk.corpus import stopwords
# English stop words used by clean_review. On a fresh environment the NLTK
# "stopwords" corpus is not installed and stopwords.words() raises LookupError,
# so fetch it once and retry instead of failing the whole notebook.
try:
    stopwords_list = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    stopwords_list = set(stopwords.words("english"))
# The backslash is written as "\\" so the literal holds the same characters as
# before without relying on the invalid escape sequence "\,".
punctuations = """"!()-![]{};:,+'"\\,<>./?@#$%^&*_~Â""" #List of punctuation to remove

# Deletion table built once, instead of once per word inside reviewParse.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations)

def reviewParse(review):
    """Remove every character listed in ``punctuations`` from a review.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The review with punctuation deleted and words separated by exactly
        one space (no leading, trailing or doubled spaces). Tokens that
        consisted only of punctuation disappear entirely.
    """
    # Deleting punctuation never touches whitespace, so translating the whole
    # string and then splitting gives the same words as a per-word translate.
    return " ".join(review.translate(_PUNCTUATION_TABLE).split())
  
def clean_review(review, stop_words=None):
    """Lower-case a review and keep only alphabetic, non-stop-word tokens.

    Parameters
    ----------
    review : str
        Whitespace-separated review text.
    stop_words : collection of str, optional
        Lower-case words to drop. When omitted, the module-level
        ``stopwords_list`` is used.

    Returns
    -------
    str
        The surviving tokens, lower-cased and joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords_list

    kept_words = []
    for word in review.split():
        lowered = word.lower()
        # Compare the lower-cased token: the stop-word list is assumed to be
        # lower-case (as NLTK's English list is), so testing the raw token let
        # capitalised stop words such as "The" slip through.
        if word.isalpha() and lowered not in stop_words:
            kept_words.append(lowered)
    return " ".join(kept_words)

# Strip punctuation from every review, then drop stop words and non-alphabetic
# tokens. NOTE: this overwrites the "Review" column in place (no new column is
# created), so the raw review text is no longer available after this cell.
df["Review"] = df["Review"].apply(reviewParse).apply(clean_review)

df.head() #Take a peek at the dataset

Unnamed: 0,Review,Rating,Sentiment
0,nice hotel expensive parking got good deal sta...,4,POSITIVE
1,ok nothing special charge diamond member hilto...,2,NEGATIVE
2,nice rooms experience hotel monaco seattle goo...,3,POSITIVE
3,unique great stay wonderful time hotel monaco ...,5,POSITIVE
4,great stay great stay went seahawk game awesom...,5,POSITIVE


In [22]:
# Features are the cleaned review text; the target is the binary sentiment label.
X, y = df["Review"], df["Sentiment"]

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [25]:
# Build a TF-IDF vectorizer with default settings and turn every cleaned review
# into a row of the matrix `tfidf`.
# NOTE(review): the vectorizer is fit on the full corpus here, i.e. before any
# train/test split, so its vocabulary and IDF weights include every document.
vectorizer=TfidfVectorizer() 
tfidf = vectorizer.fit_transform(X)

In [26]:
from sklearn.model_selection import train_test_split

# Split the cleaned text first, then learn the TF-IDF vocabulary and IDF weights
# from the training reviews only, so test documents cannot influence the
# features the model is trained on.
# random_state makes the split reproducible (same seed as the classifier);
# stratify=y keeps the POSITIVE/NEGATIVE ratio equal in both parts, which
# matters because the labels are heavily imbalanced.
X_train_text, X_test_text, Y_train, Y_test = train_test_split(
    X, y, random_state=123, stratify=y
)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

In [36]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

# Seeded logistic regression with a raised iteration cap. fit() returns the
# estimator itself, so `model` and `classifier` name the same fitted object.
classifier = LogisticRegression(
    random_state=123,
    max_iter=300,
)
model = classifier.fit(X_train, Y_train)

In [37]:
# Predict sentiment labels for the held-out test matrix and display them.
# NOTE(review): accuracy_score is imported in this notebook but never called in
# the visible cells, so these predictions are not compared against Y_test here.
Y1_pred_class= classifier.predict(X_test)
Y1_pred_class

array(['POSITIVE', 'POSITIVE', 'POSITIVE', ..., 'POSITIVE', 'POSITIVE',
       'POSITIVE'], dtype=object)

In [38]:
# Smoke test: vectorize a single one-word review with the fitted vectorizer and
# classify it. `model` is the object returned by classifier.fit(...).
test = vectorizer.transform(["good"])
model.predict(test)

array(['POSITIVE'], dtype=object)

In [39]:
import pickle

In [48]:
# Persist the fitted vectorizer together with the classifier so a consumer can
# transform raw text and predict from a single artifact.
pickle_out = {"vectorizer": vectorizer, "model": classifier}

# The context manager flushes and closes the file even if dump() raises; the
# previous bare open() left the handle to be closed by the garbage collector.
# Only load this file from a trusted source: unpickling can execute arbitrary code.
with open("classifier.pkl", "wb") as pickle_file:
    pickle.dump(pickle_out, pickle_file)