### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Import data 

In [2]:
df=pd.read_csv('hotelsBookingDemand.csv')
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [51]:
df[['lead_time','agent']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   lead_time  119390 non-null  int64  
 1   agent      103050 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.8 MB


## II. Data preprocessing

### 1. Verify missing values

In [3]:
df.isna().sum(axis=0)

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [4]:
df.isna().sum(axis=0)/len(df) * 100

hotel                              0.000000
is_canceled                        0.000000
lead_time                          0.000000
arrival_date_year                  0.000000
arrival_date_month                 0.000000
arrival_date_week_number           0.000000
arrival_date_day_of_month          0.000000
stays_in_weekend_nights            0.000000
stays_in_week_nights               0.000000
adults                             0.000000
children                           0.003350
babies                             0.000000
meal                               0.000000
country                            0.408744
market_segment                     0.000000
distribution_channel               0.000000
is_repeated_guest                  0.000000
previous_cancellations             0.000000
previous_bookings_not_canceled     0.000000
reserved_room_type                 0.000000
assigned_room_type                 0.000000
booking_changes                    0.000000
deposit_type                    

### 2. Drop rows having missing values except for variables like Agent or Company, “NULL” is presented as one of the categories

In [5]:
#drop rows having missing values in country column

df_drop = df.dropna(subset=['country','children'])
df_drop

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.00,0,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,...,No Deposit,394.0,,0,Transient,96.14,0,0,Check-Out,2017-09-06
119386,City Hotel,0,102,2017,August,35,31,2,5,3,...,No Deposit,9.0,,0,Transient,225.43,0,2,Check-Out,2017-09-07
119387,City Hotel,0,34,2017,August,35,31,2,5,2,...,No Deposit,9.0,,0,Transient,157.71,0,4,Check-Out,2017-09-07
119388,City Hotel,0,109,2017,August,35,31,2,5,2,...,No Deposit,89.0,,0,Transient,104.40,0,0,Check-Out,2017-09-07


In [6]:
df_drop.isna().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               0
babies                                 0
meal                                   0
country                                0
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16004
company         

### 3. Change arrival year, month and day feature to datetime format called arrival_date.

In [7]:
df_drop[['arrival_date_year','arrival_date_month','arrival_date_day_of_month']]

Unnamed: 0,arrival_date_year,arrival_date_month,arrival_date_day_of_month
0,2015,July,1
1,2015,July,1
2,2015,July,1
3,2015,July,1
4,2015,July,1
...,...,...,...
119385,2017,August,30
119386,2017,August,31
119387,2017,August,31
119388,2017,August,31


In [8]:
df_date=df_drop[['arrival_date_year','arrival_date_month','arrival_date_day_of_month']]

In [9]:
from datetime import datetime

def getDatetime(year,month,day):
    datestring = day+"-"+month[:3]+"-"+year
    dt = datetime.strptime(datestring, '%d-%b-%Y')
    if dt.month > 9 :
        return f'{dt.year}-{dt.month}-{dt.day}'
    elif dt.day>9 : 
        return f'{dt.year}-0{dt.month}-{dt.day}'
    else : return f'{dt.year}-0{dt.month}-0{dt.day}'

In [10]:
print(getDatetime("2015","january","14"))

2015-01-14


In [11]:
#df_date = df_date.drop('arrival_date',axis=1)

In [12]:
df_date['arrival_date_str']= df_date["arrival_date_year"].astype(str) +"-"+ df_date["arrival_date_month"].astype(str)+"-"+df_date['arrival_date_day_of_month'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_date['arrival_date_str']= df_date["arrival_date_year"].astype(str) +"-"+ df_date["arrival_date_month"].astype(str)+"-"+df_date['arrival_date_day_of_month'].astype(str)


In [13]:
df_date['arrival_date']= df_date['arrival_date_str'].apply(lambda x: getDatetime(x.split('-')[0],x.split('-')[1],x.split('-')[2]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_date['arrival_date']= df_date['arrival_date_str'].apply(lambda x: getDatetime(x.split('-')[0],x.split('-')[1],x.split('-')[2]))


In [14]:
df_date

Unnamed: 0,arrival_date_year,arrival_date_month,arrival_date_day_of_month,arrival_date_str,arrival_date
0,2015,July,1,2015-July-1,2015-07-01
1,2015,July,1,2015-July-1,2015-07-01
2,2015,July,1,2015-July-1,2015-07-01
3,2015,July,1,2015-July-1,2015-07-01
4,2015,July,1,2015-July-1,2015-07-01
...,...,...,...,...,...
119385,2017,August,30,2017-August-30,2017-08-30
119386,2017,August,31,2017-August-31,2017-08-31
119387,2017,August,31,2017-August-31,2017-08-31
119388,2017,August,31,2017-August-31,2017-08-31


In [15]:
df_drop['arrival_date']=df_date['arrival_date']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_drop['arrival_date']=df_date['arrival_date']


In [16]:
df_drop['arrival_date']

0         2015-07-01
1         2015-07-01
2         2015-07-01
3         2015-07-01
4         2015-07-01
             ...    
119385    2017-08-30
119386    2017-08-31
119387    2017-08-31
119388    2017-08-31
119389    2017-08-29
Name: arrival_date, Length: 118898, dtype: object

### 4. Verify that the timestamp of the variable reservation_status_date must occur after or at the same date as the input variable arrival_date

In [17]:
df_drop['reservation_status_date']

0         2015-07-01
1         2015-07-01
2         2015-07-02
3         2015-07-02
4         2015-07-03
             ...    
119385    2017-09-06
119386    2017-09-07
119387    2017-09-07
119388    2017-09-07
119389    2017-09-07
Name: reservation_status_date, Length: 118898, dtype: object

In [18]:
df_drop['arrival_date']

0         2015-07-01
1         2015-07-01
2         2015-07-01
3         2015-07-01
4         2015-07-01
             ...    
119385    2017-08-30
119386    2017-08-31
119387    2017-08-31
119388    2017-08-31
119389    2017-08-29
Name: arrival_date, Length: 118898, dtype: object

In [19]:
df_drop['reservation_valid']=df_drop['arrival_date'].map(lambda x:datetime.strptime(x, '%Y-%m-%d')) <= df_drop['reservation_status_date'].map(lambda x:datetime.strptime(x, '%Y-%m-%d'))
#datetime.strptime(datestring, '%d-%b-%Y')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_drop['reservation_valid']=df_drop['arrival_date'].map(lambda x:datetime.strptime(x, '%Y-%m-%d')) <= df_drop['reservation_status_date'].map(lambda x:datetime.strptime(x, '%Y-%m-%d'))


In [20]:
df_drop['reservation_valid'].value_counts()

True     76811
False    42087
Name: reservation_valid, dtype: int64

In [56]:
df_drop.fillna(value={'agent':0})

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,reservation_valid
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,,0,Transient,0.00,0,0,Check-Out,2015-07-01,2015-07-01,True
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,,0,Transient,0.00,0,0,Check-Out,2015-07-01,2015-07-01,True
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,,0,Transient,75.00,0,0,Check-Out,2015-07-02,2015-07-01,True
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,,0,Transient,75.00,0,0,Check-Out,2015-07-02,2015-07-01,True
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,,0,Transient,98.00,0,1,Check-Out,2015-07-03,2015-07-01,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,...,,0,Transient,96.14,0,0,Check-Out,2017-09-06,2017-08-30,True
119386,City Hotel,0,102,2017,August,35,31,2,5,3,...,,0,Transient,225.43,0,2,Check-Out,2017-09-07,2017-08-31,True
119387,City Hotel,0,34,2017,August,35,31,2,5,2,...,,0,Transient,157.71,0,4,Check-Out,2017-09-07,2017-08-31,True
119388,City Hotel,0,109,2017,August,35,31,2,5,2,...,,0,Transient,104.40,0,0,Check-Out,2017-09-07,2017-08-31,True


### 5. Propose a preprocessing to be made on this dataset.

In [21]:
df_pre= df_drop.drop(columns=['arrival_date_year','arrival_date_month','arrival_date_week_number','arrival_date_day_of_month'])

In [22]:
df_pre.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118898 entries, 0 to 119389
Data columns (total 30 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           118898 non-null  object 
 1   is_canceled                     118898 non-null  int64  
 2   lead_time                       118898 non-null  int64  
 3   stays_in_weekend_nights         118898 non-null  int64  
 4   stays_in_week_nights            118898 non-null  int64  
 5   adults                          118898 non-null  int64  
 6   children                        118898 non-null  float64
 7   babies                          118898 non-null  int64  
 8   meal                            118898 non-null  object 
 9   country                         118898 non-null  object 
 10  market_segment                  118898 non-null  object 
 11  distribution_channel            118898 non-null  object 
 12  is_repeated_gues

In [52]:
df_drop[]

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,reservation_valid
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,,0,Transient,0.00,0,0,Check-Out,2015-07-01,2015-07-01,True
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,,0,Transient,0.00,0,0,Check-Out,2015-07-01,2015-07-01,True
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,,0,Transient,75.00,0,0,Check-Out,2015-07-02,2015-07-01,True
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,,0,Transient,75.00,0,0,Check-Out,2015-07-02,2015-07-01,True
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,,0,Transient,98.00,0,1,Check-Out,2015-07-03,2015-07-01,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,...,,0,Transient,96.14,0,0,Check-Out,2017-09-06,2017-08-30,True
119386,City Hotel,0,102,2017,August,35,31,2,5,3,...,,0,Transient,225.43,0,2,Check-Out,2017-09-07,2017-08-31,True
119387,City Hotel,0,34,2017,August,35,31,2,5,2,...,,0,Transient,157.71,0,4,Check-Out,2017-09-07,2017-08-31,True
119388,City Hotel,0,109,2017,August,35,31,2,5,2,...,,0,Transient,104.40,0,0,Check-Out,2017-09-07,2017-08-31,True


## 4-cancellation :

In [23]:
cancellation_df = df_pre[df_pre['is_canceled']==1]
cancellation_df.head()

Unnamed: 0,hotel,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,reservation_valid
8,Resort Hotel,1,85,0,3,2,0.0,0,BB,PRT,...,,0,Transient,82.0,0,1,Canceled,2015-05-06,2015-07-01,False
9,Resort Hotel,1,75,0,3,2,0.0,0,HB,PRT,...,,0,Transient,105.5,0,0,Canceled,2015-04-22,2015-07-01,False
10,Resort Hotel,1,23,0,4,2,0.0,0,BB,PRT,...,,0,Transient,123.0,0,0,Canceled,2015-06-23,2015-07-01,False
27,Resort Hotel,1,60,2,5,2,0.0,0,BB,PRT,...,,0,Transient,107.0,0,2,Canceled,2015-05-11,2015-07-01,False
32,Resort Hotel,1,96,2,8,2,0.0,0,BB,PRT,...,,0,Transient,108.3,0,2,Canceled,2015-05-29,2015-07-01,False


In [24]:
#cancellation_df.plot.bar()

# IV. Modeling :

In [25]:
main_cols = df_pre.columns.difference(['company', 'agent', 'country','children', 'reservation_status_date', 'arrival_date']).tolist()
df_pre = df_pre[main_cols]
df_pre.head()

Unnamed: 0,adr,adults,assigned_room_type,babies,booking_changes,customer_type,days_in_waiting_list,deposit_type,distribution_channel,hotel,...,meal,previous_bookings_not_canceled,previous_cancellations,required_car_parking_spaces,reservation_status,reservation_valid,reserved_room_type,stays_in_week_nights,stays_in_weekend_nights,total_of_special_requests
0,0.0,2,C,0,3,Transient,0,No Deposit,Direct,Resort Hotel,...,BB,0,0,0,Check-Out,True,C,0,0,0
1,0.0,2,C,0,4,Transient,0,No Deposit,Direct,Resort Hotel,...,BB,0,0,0,Check-Out,True,C,0,0,0
2,75.0,1,C,0,0,Transient,0,No Deposit,Direct,Resort Hotel,...,BB,0,0,0,Check-Out,True,A,1,0,0
3,75.0,1,A,0,0,Transient,0,No Deposit,Corporate,Resort Hotel,...,BB,0,0,0,Check-Out,True,A,1,0,0
4,98.0,2,A,0,0,Transient,0,No Deposit,TA/TO,Resort Hotel,...,BB,0,0,0,Check-Out,True,A,2,0,1


In [26]:
df_pre_object = df_pre.select_dtypes(include=['object']).copy()
df_pre_object.head()

Unnamed: 0,assigned_room_type,customer_type,deposit_type,distribution_channel,hotel,market_segment,meal,reservation_status,reserved_room_type
0,C,Transient,No Deposit,Direct,Resort Hotel,Direct,BB,Check-Out,C
1,C,Transient,No Deposit,Direct,Resort Hotel,Direct,BB,Check-Out,C
2,C,Transient,No Deposit,Direct,Resort Hotel,Direct,BB,Check-Out,A
3,A,Transient,No Deposit,Corporate,Resort Hotel,Corporate,BB,Check-Out,A
4,A,Transient,No Deposit,TA/TO,Resort Hotel,Online TA,BB,Check-Out,A


In [27]:
df_pre["assigned_room_type"].value_counts()

A    73863
D    25166
E     7738
F     3732
G     2539
C     2354
B     2159
H      708
I      357
K      279
P        2
L        1
Name: assigned_room_type, dtype: int64

In [28]:
df_pre["customer_type"].value_counts()

Transient          89174
Transient-Party    25078
Contract            4076
Group                570
Name: customer_type, dtype: int64

In [29]:
df_pre["deposit_type"].value_counts()

No Deposit    104163
Non Refund     14573
Refundable       162
Name: deposit_type, dtype: int64

In [30]:
df_pre["distribution_channel"].value_counts()

TA/TO        97730
Direct       14483
Corporate     6491
GDS            193
Undefined        1
Name: distribution_channel, dtype: int64

In [31]:
df_pre["hotel"].value_counts()

City Hotel      79302
Resort Hotel    39596
Name: hotel, dtype: int64

In [32]:
df_pre["market_segment"].value_counts()

Online TA        56402
Offline TA/TO    24160
Groups           19806
Direct           12448
Corporate         5111
Complementary      734
Aviation           237
Name: market_segment, dtype: int64

In [33]:
df_pre["meal"].value_counts()

BB           91863
HB           14434
SC           10638
Undefined     1165
FB             798
Name: meal, dtype: int64

In [34]:
df_pre["reservation_status"].value_counts()

Check-Out    74745
Canceled     42950
No-Show       1203
Name: reservation_status, dtype: int64

In [35]:
df_pre["reserved_room_type"].value_counts()

A    85601
D    19173
E     6497
F     2890
G     2083
B     1114
C      931
H      601
L        6
P        2
Name: reserved_room_type, dtype: int64

In [36]:
# One hot encoding
df_pre = pd.get_dummies(df_pre, columns = ['hotel', 'meal', 'market_segment', 'distribution_channel', 'assigned_room_type',
                                           'reserved_room_type', 'deposit_type', 'customer_type', 'reservation_status'])


In [37]:
df_pre.head()

Unnamed: 0,adr,adults,babies,booking_changes,days_in_waiting_list,is_canceled,is_repeated_guest,lead_time,previous_bookings_not_canceled,previous_cancellations,...,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,reservation_status_Canceled,reservation_status_Check-Out,reservation_status_No-Show
0,0.0,2,0,3,0,0,0,342,0,0,...,1,0,0,0,0,1,0,0,1,0
1,0.0,2,0,4,0,0,0,737,0,0,...,1,0,0,0,0,1,0,0,1,0
2,75.0,1,0,0,0,0,0,7,0,0,...,1,0,0,0,0,1,0,0,1,0
3,75.0,1,0,0,0,0,0,13,0,0,...,1,0,0,0,0,1,0,0,1,0
4,98.0,2,0,0,0,0,0,14,0,0,...,1,0,0,0,0,1,0,0,1,0


In [38]:
X = df_pre.drop('is_canceled', axis = 1)
y = df_pre['is_canceled']

In [39]:
# Splitting Data
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [40]:
# Libs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [41]:
np.random.seed (42)
models = {
    "LogisticRegression" :LogisticRegression() ,
    "DecisionTreeClassifier" : DecisionTreeClassifier(),
    "RandomForestClassifier" : RandomForestClassifier(),
    "KNN" : KNeighborsClassifier(),
    "SVC" : SVC(),
}
results = {}

In [42]:
example_results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    results[model_name] = model.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
sort_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
sort_results

[('LogisticRegression', 1.0),
 ('DecisionTreeClassifier', 1.0),
 ('RandomForestClassifier', 1.0),
 ('SVC', 0.9849453322119428),
 ('KNN', 0.9023969722455846)]

In [46]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Train a model
model = LGBMClassifier(random_state=3031)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Check score
accuracy_score(y_test, y_pred)

1.0