# BUILDING A MACHINE LEARNING MODEL OF THE DISNEYLAND TOUR'S REVIEWS

## importing necessary libraries

In [3]:
# importing the necessary libraries
from sklearn.metrics import f1_score,accuracy_score,classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## reading the csv file

In [None]:
# Load the raw Disneyland reviews. `files` holds the full table as read from
# disk, while `file` is an independent copy that later cells narrow down.
files = pd.read_csv('DisneylandReviews.csv')
file = files.copy()
file.head()

In [None]:
# The model only needs the review text and its rating, so keep just those
# two columns.
wanted_columns = ['Review_Text', 'Rating']
file = file[wanted_columns]
file.head()

In [None]:
# To derive a sentiment label from each rating, first reshape the table into
# a JSON-like structure: a list holding one {'Reviews', 'Rating'} dict per row.

# Pull each column out as a plain Python list.
Ratings = list(file['Rating'])
Reviews = list(file['Review_Text'])

# Pair the two lists row by row.
new_file = [
    {'Reviews': review_text, 'Rating': star_rating}
    for star_rating, review_text in zip(Ratings, Reviews)
]

new_file[:2]

In [None]:
# now let's create the classes ...


class SENTIMENT:
    """Namespace for the two sentiment labels assigned to reviews."""
    POSITIVE = 'POSITIVE'
    NEGATIVE = 'NEGATIVE'
    # negative sentiments = ratings 1 and 2
    # positive sentiments = any other rating
    
#    
class Review:
    """A single review: its text, its rating and the sentiment derived from it."""

    def __init__(self, Reviews, Rating):
        self.Reviews = Reviews
        self.Rating = Rating
        # The label is computed once, at construction time, from the rating.
        self.Sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return SENTIMENT.NEGATIVE for ratings 1 and 2, else SENTIMENT.POSITIVE."""
        is_negative = self.Rating in (1, 2)
        return SENTIMENT.NEGATIVE if is_negative else SENTIMENT.POSITIVE
        
class ReviewContainer:
    """Holds a list of review objects and exposes them as model inputs/targets.

    Each item is expected to provide ``Reviews`` (the text) and ``Sentiment``
    (the label) attributes.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_reviews(self):
        """Return the review texts in the container's current order."""
        return [x.Reviews for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment labels, aligned with ``get_reviews()``."""
        return [x.Sentiment for x in self.reviews]

    def EvenlyDistribute(self):
        """Balance the container so both sentiments have the same count.

        Whichever class is larger is cut down to the size of the smaller one,
        keeping its first items. Afterwards ``self.reviews`` holds the
        negative reviews followed by the positive ones.
        """
        positive = [x for x in self.reviews if x.Sentiment == SENTIMENT.POSITIVE]
        negative = [x for x in self.reviews if x.Sentiment == SENTIMENT.NEGATIVE]
        # Truncate both sides: trimming only the positives would leave the
        # data unbalanced whenever negatives are the majority.
        keep = min(len(positive), len(negative))
        self.reviews = negative[:keep] + positive[:keep]

In [None]:
# Wrap every row dict in a Review object.
Disney_file = [Review(row['Reviews'], row['Rating']) for row in new_file]
Disney_file[0].Reviews

In [None]:
# Prepare the data for modelling.
# Hold out 30% of the reviews for testing; the fixed seed keeps the split
# reproducible.
train, test = train_test_split(Disney_file, random_state=21, test_size=0.3)

# Wrap each split in a ReviewContainer so it can be balanced and then
# separated into inputs (review texts) and targets (sentiment labels).
train_cont, test_cont = ReviewContainer(train), ReviewContainer(test)
for container in (train_cont, test_cont):
    container.EvenlyDistribute()

x_train, y_train = train_cont.get_reviews(), train_cont.get_sentiment()
x_test, y_test = test_cont.get_reviews(), test_cont.get_sentiment()

In [None]:
# Turn the review texts into numeric features with CountVectorizer
# (a bag-of-words representation): each review becomes a row of token counts.
# The vocabulary is learned from the training reviews only; the test reviews
# are then encoded with that same fitted vectorizer.

count_Vec = CountVectorizer()
x_train_vect = count_Vec.fit_transform(x_train)
x_test_vect = count_Vec.transform(x_test)

In [None]:
# Train a decision-tree classifier and report its accuracy and per-class
# metrics on the test set.
clf_dt = DecisionTreeClassifier()
clf_dt.fit(x_train_vect, y_train)
# Predict once and reuse the result for every metric below.
dt_predictions = clf_dt.predict(x_test_vect)
print('\nDECISION TREE CLASSIFICATION\n\n')
print('accuracy score:\n', accuracy_score(y_test, dt_predictions))
print('classification reports:\n', classification_report(y_test, dt_predictions))

In [None]:
# Train a support-vector classifier and report its accuracy and per-class
# metrics on the test set.
svc_classifier = SVC()
svc_classifier.fit(x_train_vect, y_train)
# Predict once and reuse the result for every metric below.
svc_predictions = svc_classifier.predict(x_test_vect)
print('\nSVC\n\n')
print('accuracy score:\n', accuracy_score(y_test, svc_predictions))
print('classification reports:\n', classification_report(y_test, svc_predictions))

In [None]:
# Per-class F1 scores for both classifiers, reported in the order
# [POSITIVE, NEGATIVE]: decision tree first, then SVC.
from sklearn.metrics import f1_score
sentiment_labels = [SENTIMENT.POSITIVE, SENTIMENT.NEGATIVE]
print(f1_score(y_test, clf_dt.predict(x_test_vect), average=None, labels=sentiment_labels))
print('\n')
# The SVC model is bound to `svc_classifier`; no `svc` name exists.
print(f1_score(y_test, svc_classifier.predict(x_test_vect), average=None, labels=sentiment_labels))

In [None]:
# Save the trained decision-tree classifier to disk with pickle.
# NOTE(review): only the classifier is pickled; the fitted `count_Vec`
# vectorizer is not saved here, so using this file in a fresh session would
# presumably require persisting the vectorizer as well — confirm.

import pickle

with open('disney.pkl','wb') as myfile:
    pickle.dump(clf_dt,myfile)

# Load the classifier back from disk into `model`.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('disney.pkl','rb') as myfile:
    model = pickle.load(myfile)

In [None]:
# Try the reloaded model on two hand-written reviews.
testing = [
    'the tour is a nice one .. i really enjoyed it.',
    'the tour is a bad one, no improvement at all.',
]
testing_vect = count_Vec.transform(testing)
model.predict(testing_vect)

## Visualization

## Now let's plot a graph using matplotlib

In [None]:
# Let's plot a graph showing which Disneyland branch has the highest number of positive reviews

In [None]:
# Count the reviews rated 4 or higher for each branch and compare the three
# branches in a pie chart.
# NOTE(review): "positive" here means Rating >= 4, whereas Review.get_sentiment
# labels every rating other than 1 and 2 as POSITIVE — confirm which is intended.
is_high_rating = files['Rating'] >= 4
hongkong, california, paris = (
    files.loc[(files['Branch'] == branch) & is_high_rating, 'Rating'].count()
    for branch in ('Disneyland_HongKong', 'Disneyland_California', 'Disneyland_Paris')
)

plt.style.use('ggplot')
plt.figure(figsize=(15, 5))
label = ['hongkong', 'california', 'paris']
plt.pie(
    [hongkong, california, paris],
    labels=label,
    explode=[.1, .15, .2],
    autopct='%.2f %%',
    pctdistance=.5,
)
plt.title(
    'Disneyland Branch with the highest number of POSITIVE Reviews',
    fontdict={'fontname': 'monospace', 'fontsize': 15},
)
plt.show()