## Data Wrangling

In [1]:
import pandas as pd

In [2]:
# Read the raw reviews CSV (comma-separated, latin-1 encoded) into a DataFrame
df = pd.read_csv(
    'data/DisneylandReviews.csv',
    sep=',',
    encoding='latin-1',
)

In [3]:
# Show the loaded DataFrame to get a first look at its columns and rows
df

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong
...,...,...,...,...,...,...
42651,1765031,5,missing,United Kingdom,i went to disneyland paris in july 03 and thou...,Disneyland_Paris
42652,1659553,5,missing,Canada,2 adults and 1 child of 11 visited Disneyland ...,Disneyland_Paris
42653,1645894,5,missing,South Africa,My eleven year old daughter and myself went to...,Disneyland_Paris
42654,1618637,4,missing,United States,"This hotel, part of the Disneyland Paris compl...",Disneyland_Paris


In [4]:
# We create the col 'Year' by capturing the first run of four digits in the
# 'Year_Month' col (e.g. '2019-4' -> '2019'). Rows without a match become NaN.
# The pattern is a raw string so that \d is passed to the regex engine instead of
# being read as an (invalid) string escape, and expand=False returns a Series
# for the single capture group.
df['Year'] = df['Year_Month'].str.extract(r'(\d{4})', expand=False)

In [5]:
# 2613 NaNs in the new col, which is not very significant: around 6% of total rows
df['Year'].isna().sum()

2613

In [6]:
# The NaNs come from the source data: the original 'Year_Month' col holds the
# literal value 'missing' in 2613 rows, so the same number of NaNs was expected
len(df.loc[df['Year_Month'].eq("missing")])

2613

In [7]:
# We create the col 'Month' by capturing the run of digits in 'Year_Month' that has
# no '-' anywhere after it, i.e. the part after the dash (e.g. '2019-4' -> '4').
# Rows without a match become NaN.
# The pattern is a raw string so that \d is passed to the regex engine instead of
# being read as an (invalid) string escape, and expand=False returns a Series
# for the single capture group.
df['Month'] = df['Year_Month'].str.extract(r'(\d+(?!.*-))', expand=False)

In [8]:
# Again, 2613 NaNs in the new col (the same count as for 'Year')
df['Month'].isna().sum()

2613

In [9]:
# Check the column dtypes: 'Year' is type object, so it is converted to float afterwards
df.dtypes

Review_ID             int64
Rating                int64
Year_Month           object
Reviewer_Location    object
Review_Text          object
Branch               object
Year                 object
Month                object
dtype: object

In [10]:
# 'Year' was extracted as text (dtype object); cast it to a float column
df['Year'] = df['Year'].astype('float64')

In [11]:
# Look at 3 random rows to check the new 'Year' and 'Month' columns
# (no random_state is passed, so the rows shown differ between runs)
df.sample(3)

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Year,Month
26752,129393528,5,2011-12,Australia,Lived up to our expectations and more. Really ...,Disneyland_California,2011.0,12
17546,313802869,4,2015-4,United States,"I grew up in Florida, so Disney World was prac...",Disneyland_California,2015.0,4
8521,142758318,3,2012-9,Malaysia,Visiting to Universal Studio Gold Coast and fe...,Disneyland_HongKong,2012.0,9


In [12]:
# Then we change the month number to the abbreviated month name (e.g. '12' -> 'Dec').
# The abbreviated month names are the first three letters of the full names, so we
# first convert the Month column to datetime (format '%m'), then use dt.month_name()
# to get the full month name, and finally use str.slice() to keep the first three
# letters. Rows where 'Month' is NaN stay NaN.
# This cell expects 'Month' to still hold the month numbers, so it is meant to be
# run once after the column has been extracted.
import calendar  # NOTE(review): not used by this cell
df['Month'] = pd.to_datetime(df['Month'], format='%m').dt.month_name().str.slice(stop=3)

In [13]:
# Look at 5 random rows to check that 'Month' now holds the abbreviated names
df.sample(5)

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Year,Month
21582,216405834,4,2013-12,United States,We had a wonderful time turning back the clock...,Disneyland_California,2013.0,Dec
39124,176749415,4,2013-9,United Kingdom,They are soooo rude !!! What it is with some p...,Disneyland_Paris,2013.0,Sep
28523,87410549,3,missing,United States,I live in Southern California and bought a pre...,Disneyland_California,,
18718,284121455,5,2014-9,New Zealand,"This was my Third Visit, (last one was in the ...",Disneyland_California,2014.0,Sep
34706,339176205,4,2015-8,Canada,This place was crazy! Lots and lots of people ...,Disneyland_Paris,2015.0,Aug


## Export clean csv 

The newly added 'Year' and 'Month' columns will be useful for the analysis later on. Therefore, we export the enriched dataset to a CSV file, which we will then use to seed a SQL database.

In [202]:
# Write the enriched frame to the path that the SQL export step reads back
# ("data/disney_reviews_clean.csv"), next to the raw data file.
# index=False keeps the row index out of the file, so reading it back does not
# produce an extra 'Unnamed: 0' column.
df.to_csv("data/disney_reviews_clean.csv", index=False)

## Export to SQL using sqlalchemy 

In [20]:
import pandas as pd
import sqlalchemy as alch
import os
from urllib.parse import quote

# Establishing connection

# The password is read from the DISNEY_DB_PASSWORD environment variable when it is set;
# the original placeholder value is kept as the fallback so existing local setups still run.
password = os.environ.get("DISNEY_DB_PASSWORD", "password")
dbName = "disney"
# The password is percent-encoded because characters such as '@', ':' or '/' would
# otherwise be read as URL delimiters and break the connection string.
connectionData = f"mysql+pymysql://root:{quote(password, safe='')}@127.0.0.1/{dbName}"
engine = alch.create_engine(connectionData, pool_pre_ping=True)

# Loading the clean CSV into the database table
table = "disneyland_reviews"
df = pd.read_csv("data/disney_reviews_clean.csv")
# to_sql stays the last expression so the cell displays its return value.
# No if_exists argument is passed, so pandas' default handling of an
# already-existing table applies.
df.to_sql(table, con=engine)

42656

In [9]:
# Count how many reviews there are for each park branch
df['Branch'].value_counts()

Disneyland_California    19406
Disneyland_Paris         13630
Disneyland_HongKong       9620
Name: Branch, dtype: int64