## Считывание данных

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')



In [3]:

train = pd.read_csv("train_final.csv")

print(f"Train dataset shape: {train.shape}")

Train dataset shape: (44638, 29)


In [4]:

test = pd.read_csv("test_final.csv")

print(f"Test dataset shape: {test.shape}")

Test dataset shape: (23525, 28)


In [None]:

train.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date
0,Resort Hotel,0,3,2016,February,7,12,0,2,2,...,E,F,0,No Deposit,0,Transient,81.0,1,0,2016-02-14
1,City Hotel,1,50,2016,November,47,18,1,2,2,...,A,A,0,No Deposit,0,Transient,93.6,0,2,2016-10-09
2,City Hotel,0,14,2015,September,37,7,1,5,2,...,F,F,0,No Deposit,0,Contract,166.0,0,3,2015-09-13
3,City Hotel,0,18,2015,December,50,6,2,0,2,...,A,A,0,No Deposit,0,Transient,107.0,0,0,2015-12-08
4,City Hotel,1,263,2016,September,40,29,2,4,2,...,A,A,0,No Deposit,0,Transient,100.3,0,0,2016-01-18


## Исследование данных

In [None]:
print("Number of missing values in train dataset:")
print(train.isna().sum())
print("---null---")
print(train.isnull().sum())
print("Number of missing values in test dataset:")
print(test.isna().sum())

Number of missing values in train dataset:
hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44638 entries, 0 to 44637
Data columns (total 29 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           44638 non-null  object 
 1   is_canceled                     44638 non-null  int64  
 2   lead_time                       44638 non-null  int64  
 3   arrival_date_year               44638 non-null  int64  
 4   arrival_date_month              44638 non-null  object 
 5   arrival_date_week_number        44638 non-null  int64  
 6   arrival_date_day_of_month       44638 non-null  int64  
 7   stays_in_weekend_nights         44638 non-null  int64  
 8   stays_in_week_nights            44638 non-null  int64  
 9   adults                          44638 non-null  int64  
 10  children                        44638 non-null  float64
 11  babies                          44638 non-null  int64  
 12  meal                            

**Проверка баланса классов.**


In [5]:
train['is_canceled'].value_counts()

0    31354
1    13284
Name: is_canceled, dtype: int64

In [6]:
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
y_data = pd.DataFrame(train['is_canceled'])Q
sns.countplot(x='is_canceled', data=y_data, palette='hls')
plt.show()

SyntaxError: invalid syntax (<ipython-input-6-e4b0ddf7a199>, line 3)

**Имбалансные**

## Предобработка данных

Исходя из анализа данных, можно сделать выводы:
- в данных пропусков нет
- в данных имеются категориальные признаки
- в задаче решается задача бинарной классификации

### Кодирование признаков

In [7]:
# Выбрем признаки, которые являются категориальными
train.loc[:, train.dtypes == object]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status_date
0,Resort Hotel,February,BB,PRT,Direct,Direct,E,F,No Deposit,Transient,2016-02-14
1,City Hotel,November,BB,BEL,Online TA,TA/TO,A,A,No Deposit,Transient,2016-10-09
2,City Hotel,September,BB,ITA,Online TA,TA/TO,F,F,No Deposit,Contract,2015-09-13
3,City Hotel,December,BB,ESP,Online TA,TA/TO,A,A,No Deposit,Transient,2015-12-08
4,City Hotel,September,BB,DEU,Online TA,TA/TO,A,A,No Deposit,Transient,2016-01-18
...,...,...,...,...,...,...,...,...,...,...,...
44633,City Hotel,June,BB,JPN,Online TA,TA/TO,D,D,No Deposit,Transient,2017-06-28
44634,Resort Hotel,September,BB,GBR,Online TA,TA/TO,E,F,No Deposit,Transient,2015-10-03
44635,City Hotel,April,SC,PRT,Online TA,TA/TO,A,A,No Deposit,Transient,2017-02-09
44636,City Hotel,September,HB,NOR,Offline TA/TO,TA/TO,A,A,No Deposit,Transient-Party,2016-09-12


In [8]:
train = train.drop(columns=['reservation_status_date', 'country', 'assigned_room_type', 'reserved_room_type'])
test = test.drop(columns=['reservation_status_date', 'country', 'assigned_room_type', 'reserved_room_type'])

In [None]:
cat_features = train.loc[:, train.dtypes == object].columns
cat_features

Index(['hotel', 'arrival_date_month', 'meal', 'market_segment',
       'distribution_channel', 'deposit_type', 'customer_type'],
      dtype='object')

In [None]:
train['hotel'].value_counts()

City Hotel      29335
Resort Hotel    15303
Name: hotel, dtype: int64

In [None]:
train['market_segment'].value_counts()

Online TA        21325
Offline TA/TO     9085
Groups            6868
Direct            4922
Corporate         2046
Complementary      308
Aviation            84
Name: market_segment, dtype: int64

In [None]:
train['deposit_type'].value_counts()

No Deposit    40235
Non Refund     4339
Refundable       64
Name: deposit_type, dtype: int64

In [None]:
#LabelEncorder

from sklearn import preprocessing

# 1. Создадим модель кодировщика
label_encoder = preprocessing.LabelEncoder()

# 2. Обучение кодировщика: по сути тут создает словарь уникальных слов из переданного массива,
# соотносит для каждого слова число
label_encoder.fit(train['arrival_date_month'])

# на этом этапе по сути применяет полученный словарь в переданным данных (в данном случае мы передали столбец число)
train['arrival_date_month'] = label_encoder.transform(train['arrival_date_month'])
# сразу используя этот кодировщик трансформируем данные теста
test['arrival_date_month'] = label_encoder.transform(test['arrival_date_month'])

train.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
0,Resort Hotel,0,3,2016,3,7,12,0,2,2,...,0,0,0,0,No Deposit,0,Transient,81.0,1,0
1,City Hotel,1,50,2016,9,47,18,1,2,2,...,0,0,0,0,No Deposit,0,Transient,93.6,0,2
2,City Hotel,0,14,2015,11,37,7,1,5,2,...,0,0,0,0,No Deposit,0,Contract,166.0,0,3
3,City Hotel,0,18,2015,2,50,6,2,0,2,...,0,0,0,0,No Deposit,0,Transient,107.0,0,0
4,City Hotel,1,263,2016,11,40,29,2,4,2,...,0,0,0,0,No Deposit,0,Transient,100.3,0,0


In [None]:
label_encoder.classes_

array(['April', 'August', 'December', 'February', 'January', 'July',
       'June', 'March', 'May', 'November', 'October', 'September'],
      dtype=object)

In [None]:
test.loc[:, test.dtypes == object]

Unnamed: 0,hotel,meal,market_segment,distribution_channel,deposit_type,customer_type
0,City Hotel,SC,Online TA,TA/TO,No Deposit,Transient
1,City Hotel,SC,Online TA,TA/TO,No Deposit,Transient
2,City Hotel,BB,Online TA,TA/TO,No Deposit,Transient
3,City Hotel,BB,Direct,Direct,No Deposit,Transient
4,Resort Hotel,BB,Groups,Corporate,No Deposit,Transient-Party
...,...,...,...,...,...,...
23520,City Hotel,BB,Offline TA/TO,TA/TO,No Deposit,Transient
23521,City Hotel,BB,Corporate,Corporate,No Deposit,Transient
23522,City Hotel,BB,Corporate,Corporate,No Deposit,Transient
23523,Resort Hotel,BB,Groups,TA/TO,No Deposit,Transient-Party


In [None]:
test.loc[:, test.dtypes == object].columns

Index(['hotel', 'arrival_date_month', 'meal', 'market_segment',
       'distribution_channel', 'deposit_type', 'customer_type'],
      dtype='object')

In [None]:
from sklearn import preprocessing

categorical_columns = ['hotel', 'arrival_date_month', 'meal', 'market_segment', 'distribution_channel', 'deposit_type', 'customer_type']

for col in categorical_columns:
    le = preprocessing.LabelEncoder()
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])

train.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
0,1,0,3,2016,3,7,12,0,2,2,...,0,0,0,0,0,0,2,81.0,1,0
1,0,1,50,2016,9,47,18,1,2,2,...,0,0,0,0,0,0,2,93.6,0,2
2,0,0,14,2015,11,37,7,1,5,2,...,0,0,0,0,0,0,0,166.0,0,3
3,0,0,18,2015,2,50,6,2,0,2,...,0,0,0,0,0,0,2,107.0,0,0
4,0,1,263,2016,11,40,29,2,4,2,...,0,0,0,0,0,0,2,100.3,0,0


In [None]:
test.head()

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
0,City Hotel,73,2016,July,28,6,0,2,1,0.0,...,0,0,0,0,No Deposit,0,Transient,107.1,0,0
1,City Hotel,37,2015,October,43,24,2,5,1,1.0,...,0,0,0,0,No Deposit,0,Transient,87.78,0,0
2,City Hotel,190,2017,April,14,6,2,3,2,0.0,...,0,0,0,0,No Deposit,0,Transient,88.4,0,0
3,City Hotel,287,2016,August,35,24,1,4,2,0.0,...,0,0,0,1,No Deposit,0,Transient,76.71,0,0
4,Resort Hotel,386,2016,October,43,20,1,3,2,0.0,...,0,0,0,0,No Deposit,0,Transient-Party,49.0,0,0


In [9]:
# Попробуем другой вариант кодировщика
train = pd.get_dummies(train, columns=['hotel', 'arrival_date_month', 'meal', 'market_segment', 'distribution_channel', 'deposit_type', 'customer_type'])
train.head()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,distribution_channel_Direct,distribution_channel_GDS,distribution_channel_TA/TO,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
0,0,3,2016,7,12,0,2,2,0.0,0,...,1,0,0,1,0,0,0,0,1,0
1,1,50,2016,47,18,1,2,2,0.0,0,...,0,0,1,1,0,0,0,0,1,0
2,0,14,2015,37,7,1,5,2,2.0,0,...,0,0,1,1,0,0,1,0,0,0
3,0,18,2015,50,6,2,0,2,0.0,0,...,0,0,1,1,0,0,0,0,1,0
4,1,263,2016,40,29,2,4,2,0.0,0,...,0,0,1,1,0,0,0,0,1,0


In [None]:
train.dtypes

is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
booking_changes                     int64
days_in_waiting_list                int64
adr                               float64
required_car_parking_spaces         int64
total_of_special_requests           int64
hotel_City Hotel                    uint8
hotel_Resort Hotel                  uint8
arrival_date_month_April            uint8
arrival_date_month_August           uint8
arrival_date_month_December         uint8
arrival_date_month_February       

In [10]:
# Попробуем другой вариант кодировщика
test = pd.get_dummies(test, columns=['hotel', 'arrival_date_month', 'meal', 'market_segment', 'distribution_channel', 'deposit_type', 'customer_type'])

test.dtypes

lead_time                           int64
arrival_date_year                   int64
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
booking_changes                     int64
days_in_waiting_list                int64
adr                               float64
required_car_parking_spaces         int64
total_of_special_requests           int64
hotel_City Hotel                    uint8
hotel_Resort Hotel                  uint8
arrival_date_month_April            uint8
arrival_date_month_August           uint8
arrival_date_month_December         uint8
arrival_date_month_February         uint8
arrival_date_month_January        

## Обучение модели

In [11]:
# Выделим вектор признаков и вектор ответов
X = train.drop(columns=['is_canceled'])
y = train['is_canceled']

Всего у нас получилось 33 признака.

In [12]:
print(f'X shape: {X.shape}')
print(f'Y shape: {y.shape}')

X shape: (44638, 54)
Y shape: (44638,)


In [13]:
# Создадим модель логистической регрессии
model = LogisticRegression()

# обучение модели
model.fit(X, y)

# предсказание ответов для тестовой выборки
answers_pred = model.predict(X)

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print(f'Accuracy: {accuracy_score(y, answers_pred)}')
print(f'Precision: {precision_score(y, answers_pred)}')
print(f'Recall: {recall_score(y, answers_pred)}')

Accuracy: 0.8000358438998163
Precision: 0.8007176373171405
Recall: 0.43676603432700994


**LabelEncorder:**

*   Accuracy: 0.7766476992696806
*   Precision: 0.7482768954150435
*   Recall: 0.37594098163203854



**Get_dummies:**
*   Accuracy: 0.8000358438998163
*   Precision: 0.8007176373171405
*   Recall: 0.43676603432700994

Исходя из получченых данных, делаем вывод что кодировка с помощью метода Get_dummies имеет большую точность

## Предсказание ответа для тестового набора данных

In [15]:
test.shape

(23525, 54)

In [16]:
y_pred_test = model.predict(test)

In [17]:
y_pred_test

array([0, 0, 1, ..., 0, 0, 0])

In [22]:
y_pred_test = pd.DataFrame(y_pred_test, columns=['is_canceled'])
y_pred_test = y_pred_test.reset_index()

y_pred_test.to_csv("solution.csv", index=False)
