### Data visualizer to explore data ranges and experiment with data values

![Greedybusiness](/home/jeffreymo572/Kaggles/common/images/Monopoly.jpg)

In [1]:
# Importing
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sn
import holidays

# Dataset
data_dir = "~/Kaggles/data/S3E7/train.csv"
df = pd.read_csv(data_dir)

In [None]:
# Column info
df.info()

In [None]:
df.head(10)

In [2]:
# Renaming columns to be more readable
# Current unknowns: market_segment_type, 
df.columns = ["id", 'num_adults', 'num_children', 'num_weekend_nights', 'num_week_nights', 'meal_plan', 
              'parking', 'room_type', 'lead_time', 'year', 'month', 'date', 'market_segment_type', 'repeated_customer',
              'num_prev_cancellations', 'num_prev_not_cancelled', 'avg_price_per_room', 'num_special_requests',
              'booking_status']

for i, value in enumerate(df.columns.tolist()):
    print(f"{i}: {value}")

0: id
1: num_adults
2: num_children
3: num_weekend_nights
4: num_week_nights
5: meal_plan
6: parking
7: room_type
8: lead_time
9: year
10: month
11: date
12: market_segment_type
13: repeated_customer
14: num_prev_cancellations
15: num_prev_not_cancelled
16: avg_price_per_room
17: num_special_requests
18: booking_status


#### Possible Notes for cleaning
##### Additions
* Insert ratio of cancellations at index 16 
    * `num_prev_cancellations/(num_prev_not_cancelled+avg_price_per_room)`
* Is weekend
    * `pd.Timestamp(f"{year}-{month:.2f}-{day:.2f}").dayofweek`
    * Note: Monday is 0
* Is holiday
    * Might have to hard code: https://en.wikipedia.org/wiki/Federal_holidays_in_the_United_States
* Total people
    * `num_adults+num_children`
##### Removals
* ID (done)
* date -> weekday & holiday
##### Modifications
* Month/Year/Date into one column?
    * Maybe not since month/year may have seasonal impact on cancellation

##### Questionable
* Parking
* meal_plan

# KNOWN CORRELATIONS
**PEOPLE WHO ORDER MEAL PLAN 3 HAVE A 83% CHANCE TO CANCEL!!!** \
**PEOPLE WITH A BOOKING STATUS OF 0 WILL CANCEL!!!!**

In [3]:
# Information about data 
# Mean, median, mode, variance, std, etc.
df_info = pd.DataFrame()

df_info['var'] = df.var()
df_info['mean'] = df.mean()
df_info['std'] = df.std()
df_info['var/mean'] = df.var()/df.mean()
df_info['std/mean'] = df.std()/df.mean()

df_info.reset_index(inplace=True)
df_info.columns = ['category', 'var', 'mean', 'std', 'var/mean', 'std/mean']

# Dropping and replacing values
df_info = df_info.drop(0)

df_info

Unnamed: 0,category,var,mean,std,var/mean,std/mean
1,num_adults,0.275573,1.920713,0.52495,0.143474,0.27331
2,num_children,0.202615,0.141093,0.450128,1.436046,3.190303
3,num_weekend_nights,0.784452,0.884632,0.885693,0.886756,1.0012
4,num_week_nights,2.03727,2.398005,1.42733,0.849569,0.595216
5,meal_plan,0.345361,0.239192,0.587674,1.443861,2.456908
6,parking,0.024612,0.025249,0.156884,0.974774,6.213357
7,room_type,0.693232,0.428931,0.832605,1.616184,1.941117
8,lead_time,6572.238347,103.888029,81.069343,63.262711,0.780353
9,year,0.123057,2017.856295,0.350795,6.1e-05,0.000174
10,month,8.005479,7.593539,2.829395,1.054249,0.372606


In [7]:
df['norm_adults'] = df["num_adults"]/max(df['num_adults'])
df

Unnamed: 0,id,num_adults,num_children,num_weekend_nights,num_week_nights,meal_plan,parking,room_type,lead_time,year,month,date,market_segment_type,repeated_customer,num_prev_cancellations,num_prev_not_cancelled,avg_price_per_room,num_special_requests,booking_status,norm_adults
0,0,2,0,0,2,1,0,0,9,2018,1,14,1,1,11,0,67.50,0,0,0.50
1,1,2,0,1,2,0,0,0,117,2018,7,29,0,0,0,0,72.25,0,0,0.50
2,2,2,0,0,1,0,0,0,315,2018,12,2,0,0,0,0,52.00,0,0,0.50
3,3,1,0,0,2,1,0,0,32,2018,12,1,1,0,0,0,56.00,0,0,0.25
4,4,2,0,1,0,0,0,0,258,2018,10,16,0,0,0,0,100.00,0,1,0.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42095,42095,3,0,0,4,0,0,1,160,2018,12,30,1,0,0,0,140.00,2,1,0.75
42096,42096,2,0,0,3,0,0,0,34,2017,9,23,0,0,0,0,224.67,0,0,0.50
42097,42097,2,0,0,2,2,0,0,292,2018,7,21,0,0,0,0,96.00,0,0,0.50
42098,42098,1,0,0,3,0,0,0,5,2018,11,9,0,0,0,0,120.00,0,0,0.25


In [3]:
# Holiday checker
month = df['month']
year = df['year']
day = df['date']
us_holidays = holidays.UnitedStates()
df['is_holiday'] = f"{year}-{month}-{day}" in us_holidays

ValueError: Cannot parse date from string '0        2018
1        2018
2        2018
3        2018
4        2018
         ... 
42095    2018
42096    2017
42097    2018
42098    2018
42099    2017
Name: year, Length: 42100, dtype: int64-0         1
1         7
2        12
3        12
4        10
         ..
42095    12
42096     9
42097     7
42098    11
42099    10
Name: month, Length: 42100, dtype: int64-0        14
1        29
2         2
3         1
4        16
         ..
42095    30
42096    23
42097    21
42098     9
42099    26
Name: date, Length: 42100, dtype: int64'