In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import re
import nltk # NLP toolkit
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import string
from collections import Counter

# Download NLTK resources if not already downloaded
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('vader_lexicon')

import warnings
warnings.simplefilter('ignore', category=Warning, lineno=0, append=False)

In [68]:
url='../data/DisneylandReviews.csv'
df = pd.read_csv(url,encoding="cp1252")
df

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong
...,...,...,...,...,...,...
42651,1765031,5,missing,United Kingdom,i went to disneyland paris in july 03 and thou...,Disneyland_Paris
42652,1659553,5,missing,Canada,2 adults and 1 child of 11 visited Disneyland ...,Disneyland_Paris
42653,1645894,5,missing,South Africa,My eleven year old daughter and myself went to...,Disneyland_Paris
42654,1618637,4,missing,United States,"This hotel, part of the Disneyland Paris compl...",Disneyland_Paris


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42656 entries, 0 to 42655
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Review_ID          42656 non-null  int64 
 1   Rating             42656 non-null  int64 
 2   Year_Month         42656 non-null  object
 3   Reviewer_Location  42656 non-null  object
 4   Review_Text        42656 non-null  object
 5   Branch             42656 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.0+ MB


In [70]:
df.drop(columns = 'Review_ID', inplace = True)
df

Unnamed: 0,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong
...,...,...,...,...,...
42651,5,missing,United Kingdom,i went to disneyland paris in july 03 and thou...,Disneyland_Paris
42652,5,missing,Canada,2 adults and 1 child of 11 visited Disneyland ...,Disneyland_Paris
42653,5,missing,South Africa,My eleven year old daughter and myself went to...,Disneyland_Paris
42654,4,missing,United States,"This hotel, part of the Disneyland Paris compl...",Disneyland_Paris


In [71]:
empty_values = df.isna().sum()
print(empty_values)

spaces_values = (df == ' ').sum()
print(spaces_values)

nan_values = df.isnull().sum()
print(nan_values)

Rating               0
Year_Month           0
Reviewer_Location    0
Review_Text          0
Branch               0
dtype: int64
Rating               0
Year_Month           0
Reviewer_Location    0
Review_Text          0
Branch               0
dtype: int64
Rating               0
Year_Month           0
Reviewer_Location    0
Review_Text          0
Branch               0
dtype: int64


In [72]:
df[df['Year_Month'] == 'missing']

Unnamed: 0,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
269,4,missing,Philippines,The first thing on our agenda when we finished...,Disneyland_HongKong
282,3,missing,Singapore,Brought mum for the first time to Disneyland w...,Disneyland_HongKong
622,3,missing,Canada,I have been to Tokyo and LA Disneyland!I also ...,Disneyland_HongKong
5347,4,missing,Australia,We pre bought tickets at the hotel (same price...,Disneyland_HongKong
5799,5,missing,Philippines,Disneyland is indeed the most magical place in...,Disneyland_HongKong
...,...,...,...,...,...
42651,5,missing,United Kingdom,i went to disneyland paris in july 03 and thou...,Disneyland_Paris
42652,5,missing,Canada,2 adults and 1 child of 11 visited Disneyland ...,Disneyland_Paris
42653,5,missing,South Africa,My eleven year old daughter and myself went to...,Disneyland_Paris
42654,4,missing,United States,"This hotel, part of the Disneyland Paris compl...",Disneyland_Paris


In [73]:
# Function to preprocess text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [74]:
# Preprocess the data
data_series_preprocessed = df['Review_Text'].apply(preprocess_text)
data_series_preprocessed

0        youve ever disneyland anywhere youll find disn...
1        since last time visit hk disneyland yet time s...
2        thanks god hot humid visiting park otherwise w...
3        hk disneyland great compact park unfortunately...
4        location city took around 1 hour kowlon kids l...
                               ...                        
42651    went disneyland paris july 03 thought brillian...
42652    2 adults 1 child 11 visited disneyland paris b...
42653    eleven year old daughter went visit son london...
42654    hotel part disneyland paris complex wonderful ...
42655    went disneyparis resort 1996 small child minut...
Name: Review_Text, Length: 42656, dtype: object

In [82]:
# Initialize sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Perform sentiment analysis
sentiments = data_series_preprocessed.apply(
    lambda x: sid.polarity_scores(x)['compound']
)
sentiments

0        0.7889
1        0.9851
2        0.9859
3        0.8739
4        0.5106
          ...  
42651    0.9522
42652    0.9867
42653    0.8402
42654    0.9517
42655    0.9815
Name: Review_Text, Length: 42656, dtype: float64

In [88]:
# Classify sentiment as positive or negative based on compound score
sentiment_class = sentiments.apply(
    lambda x: 'positive' if x > 0 else ('neutral' if x == 0 else 'negative')
) 

In [90]:
# Add sentiment scores to the DataFrame
df_with_sentiment = pd.DataFrame({
    'Review': df['Review_Text'], 
     'Sentiment': sentiment_class
})

# Display the DataFrame with sentiment scores
df_with_sentiment

Unnamed: 0,Review,Sentiment
0,If you've ever been to Disneyland anywhere you...,positive
1,Its been a while since d last time we visit HK...,positive
2,Thanks God it wasn t too hot or too humid wh...,positive
3,HK Disneyland is a great compact park. Unfortu...,positive
4,"the location is not in the city, took around 1...",positive
...,...,...
42651,i went to disneyland paris in july 03 and thou...,positive
42652,2 adults and 1 child of 11 visited Disneyland ...,positive
42653,My eleven year old daughter and myself went to...,positive
42654,"This hotel, part of the Disneyland Paris compl...",positive


In [99]:
num_negative = df_with_sentiment[df_with_sentiment['Sentiment'] == 'negative'].shape[0]
num_positive = df_with_sentiment[df_with_sentiment['Sentiment'] == 'positive'].shape[0]
num_neutral = df_with_sentiment[df_with_sentiment['Sentiment'] == 'neutral'].shape[0]

print(num_negative)
print(num_positive)
print(num_neutral)

3313
38825
518


In [106]:
df_with_sentiment.to_csv('../data/test.csv', index=False)

In [121]:
dataset = pd.read_csv('../data/test.csv')
dataset

Unnamed: 0,Review,Sentiment
0,If you've ever been to Disneyland anywhere you...,positive
1,Its been a while since d last time we visit HK...,positive
2,Thanks God it wasn t too hot or too humid wh...,positive
3,HK Disneyland is a great compact park. Unfortu...,positive
4,"the location is not in the city, took around 1...",positive
...,...,...
42651,i went to disneyland paris in july 03 and thou...,positive
42652,2 adults and 1 child of 11 visited Disneyland ...,positive
42653,My eleven year old daughter and myself went to...,positive
42654,"This hotel, part of the Disneyland Paris compl...",positive


In [133]:
dataset['Sentiment'] = [1 if val == 'positive' else 0 for val in dataset['Sentiment']]

dataset

Unnamed: 0,Review,Sentiment
0,If you've ever been to Disneyland anywhere you...,1
1,Its been a while since d last time we visit HK...,1
2,Thanks God it wasn t too hot or too humid wh...,1
3,HK Disneyland is a great compact park. Unfortu...,1
4,"the location is not in the city, took around 1...",1
...,...,...
42651,i went to disneyland paris in july 03 and thou...,1
42652,2 adults and 1 child of 11 visited Disneyland ...,1
42653,My eleven year old daughter and myself went to...,1
42654,"This hotel, part of the Disneyland Paris compl...",1


In [148]:
from sklearn.model_selection import train_test_split

sentences = dataset['Review'].values
y = dataset['Sentiment'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=20, random_state=1000)


In [150]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)
X_train


<42636x42547 sparse matrix of type '<class 'numpy.int64'>'
	with 3382871 stored elements in Compressed Sparse Row format>

In [151]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 1.0
