In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Analysis

In [3]:
!ls

 Sentiment Analysis.ipynb               attempt 1.ipynb
Disneyland Reviews Classification.ipynb rating_year.jpg
DisneylandReviews.csv                   rating_year_stacked_bar_chart.png
EDA and Word Clouds.ipynb               top_10_reviewer_locations.png
More EDA.ipynb


In [4]:
reviews = pd.read_csv("DisneylandReviews.csv", encoding='latin1')

In [5]:
reviews.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [6]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42656 entries, 0 to 42655
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Review_ID          42656 non-null  int64 
 1   Rating             42656 non-null  int64 
 2   Year_Month         42656 non-null  object
 3   Reviewer_Location  42656 non-null  object
 4   Review_Text        42656 non-null  object
 5   Branch             42656 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.0+ MB


In [7]:
#checking for duplicates
reviews["Review_ID"].value_counts()

121586148    2
164830205    2
129231609    2
168489234    2
226905150    2
            ..
428912320    1
428910029    1
428894903    1
428884275    1
1536786      1
Name: Review_ID, Length: 42636, dtype: int64

In [8]:
#dropping duplicates
reviews.drop_duplicates(subset=["Review_ID"], inplace = True, keep = "first")
reviews["Year_Month"] = pd.to_datetime(reviews["Year_Month"], errors = "coerce")

In [9]:
#creating separate year and month columns
reviews["Year"] = reviews["Year_Month"].dt.year
reviews["Month"] = reviews["Year_Month"].dt.month

In [10]:
reviews.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Year,Month
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,2019.0,4.0
1,670682799,4,2019-05-01,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,2019.0,5.0
2,670623270,4,2019-04-01,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,2019.0,4.0
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,2019.0,4.0
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,2019.0,4.0


In [11]:
#checking for number of missing values in each column
reviews.isnull().sum()

Review_ID               0
Rating                  0
Year_Month           2613
Reviewer_Location       0
Review_Text             0
Branch                  0
Year                 2613
Month                2613
dtype: int64

In [12]:
#dropping missing values
reviews.dropna(inplace=True)

In [13]:
reviews.shape

(40023, 8)

In [14]:
#maximum reviews for year
reviews["Year"].value_counts().sort_values(ascending=False)

2015.0    6979
2016.0    6599
2014.0    5300
2017.0    5195
2013.0    4709
2012.0    4339
2018.0    3997
2011.0    1976
2019.0     786
2010.0     143
Name: Year, dtype: int64

In [15]:
#maximum reviews for month
reviews["Month"].value_counts().sort_values(ascending=False)

8.0     3994
7.0     3876
12.0    3870
10.0    3764
6.0     3588
4.0     3476
5.0     3436
9.0     3229
3.0     3132
11.0    2684
1.0     2515
2.0     2459
Name: Month, dtype: int64

In [16]:
#from which country there is the maximym reviews
reviews["Reviewer_Location"].value_counts().sort_values(ascending=False)

United States     13513
United Kingdom     9115
Australia          4409
Canada             2115
India              1469
                  ...  
El Salvador           1
Grenada               1
Senegal               1
Madagascar            1
Andorra               1
Name: Reviewer_Location, Length: 162, dtype: int64

In [17]:
#from which country there is the maximym review, top 10 countries
reviews["Reviewer_Location"].value_counts().sort_values(ascending=False)[:10]

United States     13513
United Kingdom     9115
Australia          4409
Canada             2115
India              1469
Philippines        1024
Singapore           968
New Zealand         714
Malaysia            560
Hong Kong           515
Name: Reviewer_Location, dtype: int64

In [18]:
#number of reviews for each branch
reviews["Branch"].value_counts().sort_values(ascending=False)

Disneyland_California    18196
Disneyland_Paris         12693
Disneyland_HongKong       9134
Name: Branch, dtype: int64

In [19]:
#rating distribution
reviews["Rating"].value_counts().sort_values(ascending=False)

5    21899
4    10079
3     4778
2     1929
1     1338
Name: Rating, dtype: int64

In [20]:
#mean rating
reviews["Rating"].mean()

4.231092122029833

In [21]:
#top 5 countries that gave the best rating
reviews.groupby("Reviewer_Location")["Rating"].mean().sort_values(ascending=False)[:5]

Reviewer_Location
Libya                    5.0
Caribbean Netherlands    5.0
Cuba                     5.0
Curaçao                  5.0
Ethiopia                 5.0
Name: Rating, dtype: float64

In [22]:
#top 5 countries that gave the lowest rating
reviews.groupby("Reviewer_Location")["Rating"].mean().sort_values(ascending=True)[:5]

Reviewer_Location
Andorra                     2.000000
Turks and Caicos Islands    2.000000
South Sudan                 2.000000
Suriname                    2.000000
Ecuador                     2.333333
Name: Rating, dtype: float64

In [23]:
#which is best branch
reviews.groupby("Branch")["Rating"].mean()

Branch
Disneyland_California    4.413717
Disneyland_HongKong      4.215568
Disneyland_Paris         3.980462
Name: Rating, dtype: float64

In [24]:
#Rating distributions percentages per branch 

In [25]:
#California
reviews[reviews["Branch"]=="Disneyland_California"]["Rating"].value_counts()*100/len(reviews[reviews["Branch"]=="Disneyland_California"])

5    64.843922
4    20.394592
3     8.523851
2     3.764564
1     2.473071
Name: Rating, dtype: float64

In [26]:
#HongKong
reviews[reviews["Branch"]=="Disneyland_HongKong"]["Rating"].value_counts()*100/len(reviews[reviews["Branch"]=="Disneyland_HongKong"])

5    47.438143
4    33.369827
3    14.166849
2     3.361069
1     1.664112
Name: Rating, dtype: float64

In [27]:
#Paris
reviews[reviews["Branch"]=="Disneyland_Paris"]["Rating"].value_counts()*100/len(reviews[reviews["Branch"]=="Disneyland_Paris"])

5    45.434491
4    26.156149
3    15.228866
2     7.382022
1     5.798472
Name: Rating, dtype: float64

# Text Reviews

In [28]:
reviews.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Year,Month
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,2019.0,4.0
1,670682799,4,2019-05-01,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,2019.0,5.0
2,670623270,4,2019-04-01,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,2019.0,4.0
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,2019.0,4.0
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,2019.0,4.0


In [29]:
reviews["Review_Text"][0]

"If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides  its a Small World  is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well. "

In [30]:
#isolating the text reviews and ratings, I am going to include the Review_ID

In [31]:
data = reviews[["Review_ID","Review_Text","Rating"]]

In [32]:
data.head()

Unnamed: 0,Review_ID,Review_Text,Rating
0,670772142,If you've ever been to Disneyland anywhere you...,4
1,670682799,Its been a while since d last time we visit HK...,4
2,670623270,Thanks God it wasn t too hot or too humid wh...,4
3,670607911,HK Disneyland is a great compact park. Unfortu...,4
4,670607296,"the location is not in the city, took around 1...",4


# 

In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl (197 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.6/197.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-macosx_10_11_x86_64.whl (4.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
Installing collected packages: tokenizers, pyyaml, huggingface-hub, transform

In [33]:
pip install torch torchvision torchaudio

Collecting torchvision
  Downloading torchvision-0.15.1-cp310-cp310-macosx_10_9_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting torchaudio
  Downloading torchaudio-2.0.1-cp310-cp310-macosx_10_9_x86_64.whl (3.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: torchvision, torchaudio
Successfully installed torchaudio-2.0.1 torchvision-0.15.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [34]:
pip show torch

Name: torch
Version: 2.0.0
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages
Requires: filelock, jinja2, networkx, sympy, typing-extensions
Required-by: stanza, torchaudio, torchvision
Note: you may need to restart the kernel to use updated packages.


In [35]:
pip install --upgrade pip

Collecting pip
  Downloading pip-23.1.2-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.3
    Uninstalling pip-22.3:
      Successfully uninstalled pip-22.3
Successfully installed pip-23.1.2
Note: you may need to restart the kernel to use updated packages.


In [36]:
from transformers import pipeline

In [37]:
import torch
import torch.nn.functional as F

In [38]:
#using pipline is a great and easy way to use hugging face models
#this is the default hugging face sentament analysis model 
classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [39]:
result1 = classifier(reviews["Review_Text"][0])
print(result1)

[{'label': 'POSITIVE', 'score': 0.999860405921936}]


In [40]:
reviews["Review_Text"][0]

"If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides  its a Small World  is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well. "

In [41]:
result2 = classifier(reviews["Review_Text"][1000])
print(result2)

[{'label': 'NEGATIVE', 'score': 0.8825691342353821}]


In [42]:
reviews["Review_Text"][1000]

"I thought this is the happiest and coolest place on earth, I was wrong. There is limited numbers of rides to choose. they don't offer extreme rides unlike any other theme park. The castle is on going rehabilitation. The parade was good, but if you want and prefer to all rides you will not be satisfied to HK Disneyland."

In [49]:
# data["sentiment"] = ""
# for i, row in reviews.iterrows():
#     text = row["Review_Text"]
#     results = classifier(text)
#     sentiment_label = results[0]["label"]
#     reviews.at[i, "sentiment"] = sentiment_label
# reviews.to_csv("DisneylandReviews.csv", index =False)

In [43]:
# create the sentiment classification pipeline
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

for i, row in reviews.iterrows():
    text = row["Review_Text"]
    results = classifier(text, truncation=True)
    sentiment_label = results[0]["label"]
    reviews.at[i, "sentiment"] = sentiment_label

KeyboardInterrupt: 

In [None]:
batch_size = 16  # Adjust the batch size according to your hardware capabilities
num_batches = len(reviews) // batch_size + 1

for i in range(num_batches):
    start_index = i * batch_size
    end_index = (i + 1) * batch_size
    batch_reviews = reviews.iloc[start_index:end_index]["Review_Text"].tolist()
    results = classifier(batch_reviews, truncation=True)
    
    for j, result in enumerate(results):
        sentiment_label = result["label"]
        reviews.at[start_index + j, "sentiment"] = sentiment_label

In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english", device=device.index)


In [51]:
#trying just 5 rows
# create the sentiment classification pipeline
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

for i, row in reviews[:5].iterrows():
    text = row["Review_Text"]
    results = classifier(text, truncation=True)[0]
    sentiment_label = results["label"]
    sentiment_score = results["score"]
    reviews.at[i, "sentiment"] = sentiment_label
    reviews.at[i, "score"] = sentiment_score


In [52]:
reviews.head(5)

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Year,Month,sentiment,score
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,2019.0,4.0,POSITIVE,0.99986
1,670682799,4,2019-05-01,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,2019.0,5.0,POSITIVE,0.53664
2,670623270,4,2019-04-01,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,2019.0,4.0,POSITIVE,0.986923
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,2019.0,4.0,POSITIVE,0.958155
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,2019.0,4.0,POSITIVE,0.99924


In [54]:
# reviews.drop(columns=["sentiment_HF"], inplace=True)
#idk where this column came from or what it does

In [55]:
reviews.head(5)

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Year,Month,sentiment,score
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,2019.0,4.0,POSITIVE,0.99986
1,670682799,4,2019-05-01,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,2019.0,5.0,POSITIVE,0.53664
2,670623270,4,2019-04-01,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,2019.0,4.0,POSITIVE,0.986923
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,2019.0,4.0,POSITIVE,0.958155
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,2019.0,4.0,POSITIVE,0.99924


In [1]:
#trying again for the whole dataset
# create the sentiment analysis pipeline
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

# iterate over the rows and update the sentiment_score column
for i, row in reviews.iterrows():
    text = row["Review_Text"]
    results = classifier(text, truncation=True)
    sentiment_score = results[0]["score"]
    reviews.at[i, "sentiment_score"] = sentiment_score


NameError: name 'pipeline' is not defined