In [175]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [242]:
train_df = pd.read_csv('train.csv')
train_df.columns


Index(['ID', 'hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')

In [243]:
train_df = train_df.drop(columns=['ID','adr','arrival_date_year','reservation_status','arrival_date_month', 'reservation_status_date','agent','company'])
train_df['country']=train_df['country'].fillna(train_df['country'].value_counts().index[0])
train_df['children']=train_df['children'].fillna(train_df['children'].median())



In [244]:
#train_df.info()

In [245]:
le = LabelEncoder()
categorical_features = list(train_df.columns[train_df.dtypes == object])
train_df[categorical_features]=train_df[categorical_features].apply(lambda x: le.fit_transform(x))


In [246]:
x = train_df.drop(columns=['is_canceled'])
y = train_df['is_canceled']


In [249]:
x.shape

(91531, 24)

In [247]:
x.to_csv('x_train_data.csv',index=False)
y.to_csv('y_label_data.csv',index=False)

In [119]:
x_train, x_val, y_train, y_val = train_test_split(x,y,random_state=49)

In [120]:
x_train

Unnamed: 0,hotel,lead_time,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,...,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests
25825,0,2,7,13,0,1,2,0.0,0,0,...,0,0,0,0,0,0,0,2,0,1
69188,0,34,44,26,0,3,2,0.0,0,3,...,0,0,0,0,0,0,0,2,0,3
46854,0,160,25,17,1,2,2,0.0,0,0,...,0,0,3,3,0,0,0,3,0,1
46796,0,2,25,16,0,2,2,0.0,0,3,...,0,0,0,3,0,0,0,2,0,1
15081,0,61,43,19,1,4,2,0.0,0,0,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7924,0,16,37,9,0,2,2,0.0,0,0,...,0,0,0,0,0,0,0,0,0,2
19638,0,198,49,5,2,1,1,0.0,0,2,...,0,0,0,0,0,1,0,2,0,0
81397,0,59,4,23,1,1,2,0.0,0,0,...,0,0,0,0,0,1,0,2,0,0
54957,0,190,33,8,1,2,2,0.0,0,3,...,0,0,0,0,1,0,0,3,1,2


In [121]:
y_train

25825    0
69188    1
46854    0
46796    0
15081    0
        ..
7924     0
19638    1
81397    1
54957    0
426      1
Name: is_canceled, Length: 68648, dtype: int64

In [122]:
rf_params = {
    'n_estimators': [150],  
    'max_features':[20],
    'max_depth': [13]
}

# Perform Grid Search
rf_gs = GridSearchCV(RandomForestClassifier(random_state=49),
                 rf_params,
                 cv = 5,
                 scoring = 'accuracy')
rf = rf_gs.fit(x_train, y_train)
print(f'Best Training Accuracy: {rf.score(x_train, y_train)}')
print(f'Best Testing Accuracy: {rf.score(x_val, y_val)}')

Best Training Accuracy: 0.8834343316629764
Best Testing Accuracy: 0.8673250884936415


In [123]:
dt_params = {
    'max_depth':[None],
    'max_features' : [20],
    'min_samples_split': [25], 
    'min_samples_leaf': [1]
}

# Perform Grid Search
dt_gs = GridSearchCV(DecisionTreeClassifier(random_state=49),
                 dt_params,
                 cv = 5,
                 scoring = 'accuracy')
dt = dt_gs.fit(x_train, y_train)
print(f'Best Training Accuracy: {dt.score(x_train, y_train)}')
print(f'Best Testing Accuracy: {dt.score(x_val, y_val)}')

Best Training Accuracy: 0.9182059200559375
Best Testing Accuracy: 0.8581916706725516


In [126]:
test_df = pd.read_csv('test.csv')

In [127]:
test_df = test_df.drop(columns=['ID','agent','company','arrival_date_year','arrival_date_month'])
test_df['country']=test_df['country'].fillna(train_df['country'].value_counts().index[0])
test_df['children']=test_df['children'].fillna(test_df['children'].median()).astype(int)


In [128]:
#test_df.info()

In [129]:

categorical_features = list(test_df.columns[test_df.dtypes == object])
test_df[categorical_features] = test_df[categorical_features].astype(str)
test_df[categorical_features]=test_df[categorical_features].apply(lambda x: le.fit_transform(x))


In [130]:
test_df

Unnamed: 0,hotel,lead_time,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,...,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests
0,0,75,13,1,2,5,2,0,0,0,...,0,0,3,3,0,0,0,2,0,1
1,0,208,13,1,4,10,2,0,0,3,...,0,0,0,0,0,0,0,2,0,1
2,1,12,13,1,2,5,2,0,0,4,...,0,0,0,0,0,0,0,0,0,1
3,0,76,13,1,2,5,3,0,0,0,...,0,0,3,3,0,0,0,2,0,0
4,0,9,13,1,2,4,2,2,0,0,...,0,0,5,5,0,0,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27854,1,108,35,31,2,5,2,0,0,2,...,0,0,4,4,0,0,0,2,0,1
27855,1,194,35,31,2,5,2,1,0,2,...,0,0,6,6,3,0,0,2,1,1
27856,1,17,35,31,0,3,2,0,0,2,...,0,0,0,0,0,0,0,2,0,2
27857,1,191,35,31,2,5,2,0,0,2,...,0,0,3,3,0,0,0,0,0,0


In [132]:
can_pred = rf.predict(test_df)
np.sum(can_pred==1)

7575

# adr

In [163]:
adr_df = pd.read_csv('train.csv')
adr_df.columns

Index(['ID', 'hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')

In [164]:
adr_df = adr_df.drop(columns=['ID','arrival_date_year','reservation_status','arrival_date_month', 'reservation_status_date','agent','company'])
adr_df = adr_df.drop(index=31980)
adr_df['country']=adr_df['country'].fillna(adr_df['country'].value_counts().index[0])
adr_df['children']=adr_df['children'].fillna(adr_df['children'].median())



In [165]:
le = LabelEncoder()
categorical_features = list(adr_df.columns[adr_df.dtypes == object])
adr_df[categorical_features]=adr_df[categorical_features].apply(lambda x: le.fit_transform(x))


In [166]:
adr_x = adr_df.drop(columns=['adr'])
adr_y = np.array(adr_df['adr'])

In [167]:
xadr_train, xadr_val, yadr_train, yadr_val = train_test_split(adr_x,adr_y,random_state=49)

In [168]:
yadr_train

array([258.84201536,  81.05465741, 133.72149024, ...,  91.75969222,
        51.91740188,  79.8468171 ])

In [169]:
ss_x = MinMaxScaler()
ss_y = MinMaxScaler()
yadr_train = np.reshape(yadr_train,(-1,1))
yadr_val = np.reshape(yadr_val,(-1,1))
print(ss_x.fit(xadr_train))
xtrain_scale=ss_x.transform(xadr_train)
print(ss_x.fit(xadr_val))
xval_scale=ss_x.transform(xadr_val)

print(ss_y.fit(yadr_train))
ytrain_scale=ss_y.transform(yadr_train)
print(ss_y.fit(yadr_val))
yval_scale=ss_y.transform(yadr_val)

MinMaxScaler()
MinMaxScaler()
MinMaxScaler()
MinMaxScaler()


In [170]:
xtrain_scale

array([[1.        , 1.        , 0.20217096, ..., 0.66666667, 0.        ,
        0.        ],
       [1.        , 1.        , 0.24830393, ..., 0.66666667, 0.        ,
        0.        ],
       [0.        , 0.        , 0.0027137 , ..., 0.66666667, 0.        ,
        0.2       ],
       ...,
       [0.        , 1.        , 0.08005427, ..., 0.66666667, 0.        ,
        0.        ],
       [0.        , 0.        , 0.21166893, ..., 0.66666667, 0.        ,
        0.        ],
       [0.        , 1.        , 0.11261872, ..., 0.66666667, 0.        ,
        0.2       ]])

In [171]:
ytrain_scale

array([[0.62265212],
       [0.3483351 ],
       [0.42959739],
       ...,
       [0.36485243],
       [0.30337777],
       [0.34647146]])

In [172]:
rfr_params = {
    'n_estimators': [150],  
    'max_features':[20],
    'max_depth': [13]
}

# Perform Grid Search
rfr_gs = GridSearchCV(RandomForestRegressor(random_state=49),
                 rfr_params,
                 cv = 5,
                 scoring = 'r2')
rfr = rfr_gs.fit(xtrain_scale, ytrain_scale)
print(f'Best Training Accuracy: {rfr.score(xtrain_scale, ytrain_scale)}')
print(f'Best Testing Accuracy: {rfr.score(xval_scale, yval_scale)}')

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


Best Training Accuracy: 0.8534440303264799
Best Testing Accuracy: 0.42461978213593143


In [174]:
dtr_params = {
    'max_depth':[None],
    'max_features' : [20],
    'min_samples_split': [25], 
    'min_samples_leaf': [1]
}

# Perform Grid Search
dtr_gs = GridSearchCV(DecisionTreeRegressor(random_state=49),
                 dtr_params,
                 cv = 5,
                 scoring = 'r2')
dtr = dtr_gs.fit(xtrain_scale, ytrain_scale)
print(f'Best Training Accuracy: {dtr.score(xtrain_scale, ytrain_scale)}')
print(f'Best Testing Accuracy: {dtr.score(xval_scale, yval_scale)}')

Best Training Accuracy: 0.8809600574408805
Best Testing Accuracy: 0.08187200226740043


In [177]:
svr_params = {
    'kernel' : ['rbf','linear','poly','sigmoid'],
    'C' : [0.1,1,10]
}

# Perform Grid Search
svr_gs = GridSearchCV(SVR(),
                 svr_params,
                 cv = 5,
                 scoring = 'r2')
svr = svr_gs.fit(xtrain_scale, ytrain_scale)
print(f'Best Training Accuracy: {svr.score(xtrain_scale, ytrain_scale)}')
print(f'Best Testing Accuracy: {svr.score(xval_scale, yval_scale)}')

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


Best Training Accuracy: 0.6043731810856163
Best Testing Accuracy: -0.11255634242764367


In [192]:
adr_test = pd.read_csv('test.csv')
adr_test.columns

Index(['ID', 'hotel', 'lead_time', 'arrival_date_year', 'arrival_date_month',
       'arrival_date_week_number', 'arrival_date_day_of_month',
       'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children',
       'babies', 'meal', 'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type',
       'required_car_parking_spaces', 'total_of_special_requests'],
      dtype='object')

In [193]:
adr_test = adr_test.drop(columns=['ID','arrival_date_year','arrival_date_month','agent','company'])
adr_test['country']=adr_test['country'].fillna(adr_test['country'].value_counts().index[0])
adr_test['children']=adr_test['children'].fillna(adr_test['children'].median())


In [194]:
adr_test.columns

Index(['hotel', 'lead_time', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type',
       'days_in_waiting_list', 'customer_type', 'required_car_parking_spaces',
       'total_of_special_requests'],
      dtype='object')

In [195]:
can_pred

array([0, 0, 0, ..., 0, 0, 0])

In [196]:
adr_test.insert(1,'is_canceled',can_pred)

In [201]:
adr_test.columns == adr_x.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [203]:

categorical_features = list(adr_test.columns[adr_test.dtypes == object])
adr_test[categorical_features]=adr_test[categorical_features].apply(lambda x: le.fit_transform(x))


In [204]:
print(ss_x.fit(adr_test))

MinMaxScaler()


In [205]:
adr_scaler = ss_x.transform(adr_test)

In [206]:
adr_scaler

array([[0.        , 0.        , 0.14395393, ..., 0.66666667, 0.        ,
        0.2       ],
       [0.        , 0.        , 0.39923225, ..., 0.66666667, 0.        ,
        0.2       ],
       [1.        , 0.        , 0.02303263, ..., 0.        , 0.        ,
        0.2       ],
       ...,
       [1.        , 0.        , 0.03262956, ..., 0.66666667, 0.        ,
        0.4       ],
       [1.        , 0.        , 0.36660269, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00575816, ..., 0.66666667, 0.        ,
        0.8       ]])

In [208]:
adr_pred = dtr.predict(adr_scaler)

In [211]:
adr_pred = np.reshape(adr_pred,(-1,1))
adr_pred

array([[0.3410053 ],
       [0.31977677],
       [0.26490193],
       ...,
       [0.38746353],
       [0.3238694 ],
       [0.34845043]])

In [235]:
fin_adr = ss_y.inverse_transform(adr_pred)
len(fin_adr)

27859

In [236]:
adr_add = pd.read_csv('adr.csv')
adr_add['DTR'] = fin_adr
adr_add.to_csv('adr.csv',index=False)

In [241]:
def save_reult(fin_adr, fin_can):
    ans = pd.read_csv('test_nolabel.csv')
    df = pd.read_csv('test.csv')
    df['arrival_date_full'] = df['arrival_date_year'].astype(str) + "-" + df['arrival_date_month'].map({'January':1, 'February':2, 'March':3, 'April':4, 'May':5, 'June':6, 'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12}).astype(str) + "-" + df['arrival_date_day_of_month'].astype(str)
    df['arrival_date_full'] = pd.to_datetime(df['arrival_date_full'], format="%Y-%m-%d")
    df['status_minus_arrival_date'] = df['stays_in_week_nights'] + df['stays_in_weekend_nights']
    tem = pd.DataFrame(fin_adr,columns=['adr'])
    tem['canceled'] = fin_can
    tem['canceled'] = np.where(tem['canceled']==0,1,0)
    tem['arrival_date_full'] = df['arrival_date_full']
    tem['status_minus_arrival_date'] = df['status_minus_arrival_date']
    tem['score'] = (tem['status_minus_arrival_date'] * tem['canceled'] * tem['adr'])
    tem = tem.groupby(tem.arrival_date_full).sum().astype(int)
    tem['score'] = ((tem['score'])/10000).astype(int)
    ans['label'] = tem['score'].values
    ans.to_csv('final_ans_dtr.csv',index=False)