In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from kmodes.kprototypes import KPrototypes
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
RANDOM_STATE = 42

In [4]:
pd.set_option('display.max_columns', None)

# predict Cancellation

In [142]:
def format_lenght(date):
    return date[0]
def K_Prototypes(X):
    kproto_4 = KPrototypes(n_clusters=4, init='Huang', verbose=0, random_state=42, n_jobs=-1)
    clusters = kproto_4.fit_predict(X, categorical=[3, 11, 12, 13, 14, 17, 18, 20, 22])
    return clusters
def get_train_data(filename):
    df = pd.read_csv(filename)
    df = df.drop(columns=['ID','adr'])
    df.dropna(subset=['children'], inplace=True)
    df['country']=df['country'].fillna(df['country'].value_counts().index[0]) 
    df['reservation_status_date'] = pd.to_datetime(df['reservation_status_date'])
    df['hotel'] = np.where(df['hotel']=='City Hotel',1,0)
    df['arrival_date_full'] = df['arrival_date_year'].astype(str) + "-" + df['arrival_date_month'].map({'January':1, 'February':2, 'March':3, 'April':4, 'May':5, 'June':6, 'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12}).astype(str) + "-" + df['arrival_date_day_of_month'].astype(str)
    df['arrival_date_full'] = pd.to_datetime(df['arrival_date_full'], format="%Y-%m-%d")
    df['status_minus_arrival_date'] = df['stays_in_week_nights'] + df['stays_in_weekend_nights']
    X = df.drop(columns=['is_canceled', 'reservation_status', 'agent', 'company', 'country', 'reservation_status_date', 'arrival_date_full'])
    clusters = K_Prototypes(X)
    df['clusters'] = clusters
    df = pd.get_dummies(df, columns=[ 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type','country'])
    x = df.drop(columns=['is_canceled','arrival_date_year','arrival_date_month','reservation_status', 'agent', 'company', 
                     'reservation_status_date', 'arrival_date_full'])
    y = df['is_canceled']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, 
                                                    stratify=y, test_size=0.2)
    return x_train, x_test, y_train, y_test
    
    

In [143]:
def get_test_data(filename):
    df = pd.read_csv(filename)
    df = df.drop(columns=['ID'])
    #df['agent'] = df['agent'].fillna('No_Agent')
    #df['company'] = df['company'].fillna('No_Company')
    #df['country'].fillna("unknown", inplace=True)  
    #df['reservation_status_date'] = pd.to_datetime(df['reservation_status_date'])
    df['country']=df['country'].fillna(df['country'].value_counts().index[0]) 
    df['hotel'] = np.where(df['hotel']=='City Hotel',1,0)
    df['arrival_date_full'] = df['arrival_date_year'].astype(str) + "-" + df['arrival_date_month'].map({'January':1, 'February':2, 'March':3, 'April':4, 'May':5, 'June':6, 'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12}).astype(str) + "-" + df['arrival_date_day_of_month'].astype(str)
    df['arrival_date_full'] = pd.to_datetime(df['arrival_date_full'], format="%Y-%m-%d")
    df['status_minus_arrival_date'] = df['stays_in_week_nights'] + df['stays_in_weekend_nights']
    X = df.drop(columns=['agent', 'company', 'country', 'arrival_date_full'])
    clusters = K_Prototypes(X)
    df['clusters'] = clusters
    df = pd.get_dummies(df, columns=['meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type','country'])
    x_test = df.drop(columns=['arrival_date_year','arrival_date_month', 'agent', 'company',  'arrival_date_full'])
    return x_test
    

In [144]:
def DTree(x_train, x_test, y_train, y_test):
    dt_params = {
    'max_depth':[None],
    'max_features' : [0.7],
    'min_samples_split': [25], 
    'min_samples_leaf': [1]
    }

    # Perform Grid Search
    dt_gs = GridSearchCV(DecisionTreeClassifier(random_state=42),
                     dt_params,
                     cv = 5,
                     scoring = 'accuracy')
    dt = dt_gs.fit(x_train, y_train)
    
    print(f'Best Training Accuracy: {dt.score(x_train, y_train)}')
    print(f'Best Testing Accuracy: {dt.score(x_test, y_test)}')
    return dt


In [145]:
def missing_col(x_train,x_pred):
    missing_cols = set( x_train.columns ) - set( x_pred.columns )

    for c in missing_cols:
        x_pred[c] = 0

    x_pred = x_pred[x_train.columns]
    return x_pred

In [146]:
x_pred = get_test_data('test.csv')
x_train, x_test, y_train, y_test = get_train_data('train.csv')


In [151]:
len(x_pred.columns)

201

In [152]:
len(x_train.columns)

227

In [153]:
x_pred = missing_col(x_train, x_pred)

In [157]:
clf = DTree(x_train, x_test, y_train, y_test)
y_pred_cancel = clf.predict(x_pred)
print(sum(y_pred_cancel == 1))
print(sum(y_pred_cancel != 1))

Best Training Accuracy: 0.9206238647382582
Best Testing Accuracy: 0.8617939473396701
8764
19095


## predict adr

In [33]:
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [13]:
def K_Prototypes(X):
    kproto_4 = KPrototypes(n_clusters=4, init='Huang', verbose=0, random_state=42, n_jobs=-1)
    clusters = kproto_4.fit_predict(X, categorical=[10,11,12,16,17,19,21])
    return clusters
def get_train_data(filename):
    df = pd.read_csv('train.csv')
    df = df.drop(columns=['ID','arrival_date_year','arrival_date_month','reservation_status', 'agent', 'company', 'reservation_status_date'])
    df.dropna(subset=['children'], inplace=True)
    df['country']=df['country'].fillna(df['country'].value_counts().index[0])
    df['children'] = df['children'].astype(int)   
    df['hotel'] = np.where(df['hotel']=='City Hotel',1,0)
    df['status_minus_arrival_date'] = df['stays_in_week_nights'] + df['stays_in_weekend_nights']
    X = df.drop(columns=['country','adr'])
    clusters = K_Prototypes(X)
    df['clusters'] = clusters
    df.sample()
    df.country = df.country.astype("category").cat.codes
    df.meal = df.meal.astype("category").cat.codes
    df.market_segment = df.market_segment.astype("category").cat.codes
    df.distribution_channel = df.distribution_channel.astype("category").cat.codes
    df.reserved_room_type = df.reserved_room_type.astype("category").cat.codes
    df.assigned_room_type = df.assigned_room_type.astype("category").cat.codes
    df.deposit_type = df.deposit_type.astype("category").cat.codes
    df.customer_type = df.customer_type.astype("category").cat.codes
    x = df.drop(columns=['adr'])
    y = df['adr']
    x_train, x_val, y_train, y_val = train_test_split(x, y, random_state=42)
    return x_train, x_val, y_train, y_val
    

In [14]:
x_train_adr, x_val_adr, y_train_adr, y_val_adr = get_train_data('train.csv')

In [23]:
ytrain = np.reshape(y_train_adr.values,(-1,1))
yval = np.reshape(y_val_adr.values,(-1,1))
scaler_x = StandardScaler()
scaler_y = StandardScaler()

print(scaler_x.fit(x_train_adr))
xtrain_scale=scaler_x.transform(x_train_adr)
print(scaler_x.fit(x_val_adr))
xval_scale=scaler_x.transform(x_val_adr)

print(scaler_y.fit(ytrain))
ytrain_scale=scaler_y.transform(ytrain)
print(scaler_y.fit(yval))
yval_scale=scaler_y.transform(yval)

StandardScaler()
StandardScaler()
StandardScaler()
StandardScaler()


In [24]:
xtrain_scale

array([[ 0.7202746 ,  1.33833027, -0.2602869 , ..., -0.68772096,
        -0.52149141,  0.94203986],
       [-1.38835939, -0.74719972, -0.8201925 , ..., -0.68772096,
         0.64266397, -0.31349506],
       [ 0.7202746 ,  1.33833027, -0.45008541, ..., -0.68772096,
        -0.13343962, -0.31349506],
       ...,
       [-1.38835939, -0.74719972, -0.88662198, ...,  0.61678056,
        -0.90954321, -0.31349506],
       [-1.38835939, -0.74719972, -0.62090407, ..., -0.68772096,
         1.41876756, -0.31349506],
       [-1.38835939,  1.33833027, -0.82968243, ..., -0.68772096,
        -0.13343962, -0.31349506]])

In [25]:
ytrain_scale

array([[ 1.26421433],
       [ 1.32734665],
       [ 0.41254559],
       ...,
       [-0.98470233],
       [ 0.63587045],
       [-0.32112519]])

In [34]:
lin = LinearRegression()
clf_lin = lin.fit(xtrain_scale,ytrain_scale)
print(f'Best Training Accuracy: {clf_lin.score(xtrain_scale, ytrain_scale)}')
print(f'Best Testing Accuracy: {clf_lin.score(xval_scale, yval_scale)}')

Best Training Accuracy: 0.3586694381330643
Best Testing Accuracy: -1.0931867407583084e+18


In [26]:
svr = SVR()
m = svr.fit(xtrain_scale, ytrain_scale.ravel())
print(f'Best Training Accuracy: {m.score(xtrain_scale, ytrain_scale)}')
print(f'Best Testing Accuracy: {m.score(xval_scale, yval_scale)}')

Best Training Accuracy: 0.6057606497164391
Best Testing Accuracy: 0.6638397101508127


In [32]:
lr = [6,8,10]
for i in lr:
    svr = SVR(C=i)
    m = svr.fit(xtrain_scale, ytrain_scale.ravel())
    print(f'Best Training Accuracy: {m.score(xtrain_scale, ytrain_scale)}')
    print(f'Best Testing Accuracy: {m.score(xval_scale, yval_scale)}')

Best Training Accuracy: 0.6659949054436183
Best Testing Accuracy: 0.6984091746880481
Best Training Accuracy: 0.6747901526586915
Best Testing Accuracy: 0.701524256712232
Best Training Accuracy: 0.6815888550332583
Best Testing Accuracy: 0.7034247713951591


In [None]:
y_pred = m.predict()

In [174]:
print(f'Best Training Accuracy: {m.score(x_train_adr, y_train_adr)}')
print(f'Best Testing Accuracy: {m.score(x_val_adr, y_val_adr)}')

Best Training Accuracy: 0.16068967614180318
Best Testing Accuracy: 0.19076746803799483


In [29]:
SVR_params = { 
    'gamma': [1e-3, 1e-4],
    'C': [1,10,100,1000],}

svr_gs = GridSearchCV(SVR(),
                 SVR_params,
                 cv = 5,
                 scoring = 'accuracy')
svr_clf = svr_gs.fit(xtrain_scale, ytrain_scale.ravel())
print(f'Best Training Accuracy: {svr_clf.score(xtrain_scale, ytrain_scale)}')
print(f'Best Testing Accuracy: {svr_clf.score(xval_scale, yval_scale)}')

ValueError: continuous is not supported

In [None]:
print(f'Best Training Accuracy: {lg.score(x_train_adr, y_train_adr)}')
print(f'Best Testing Accuracy: {lg.score(x_val_adr, y_val_adr)}')