In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [107]:
# Read the input table from 'hotelPriceCleaned.csv' (path is relative to the
# notebook's working directory).
dataset = pd.read_csv('hotelPriceCleaned.csv')

# Per-column count of missing values; displayed as the cell output.
dataset.isnull().sum()

hotel                             0
is_canceled                       0
lead_time                         0
meal                              0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
booking_changes                   0
agent                             0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests         0
total_members                     0
total_nights_stayed               0
dtype: int64

In [108]:
# Preview the first five rows of the loaded table.
dataset.head(n=5)

Unnamed: 0,hotel,is_canceled,lead_time,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,booking_changes,agent,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,total_members,total_nights_stayed
0,-1.437393,-0.640696,-0.939416,0.187305,1.16863,2.118765,-0.172577,-0.081677,-0.088724,-0.561389,-0.372516,-0.590609,-0.067037,0.392233,-0.931096,-0.302727,-0.881914,-1.588769,-1.023513
1,-1.437393,-0.640696,-0.85886,0.187305,-2.967412,-2.601195,-0.172577,-0.081677,-0.088724,-0.561389,-0.372516,2.162981,-0.067037,0.392233,-0.931096,-0.302727,-0.881914,-1.588769,-1.023513
2,-1.437393,-0.640696,-0.845434,0.187305,0.341421,-0.241215,-0.172577,-0.081677,-0.088724,-0.561389,-0.372516,1.555292,-0.067037,0.392233,-0.354134,-0.302727,0.311727,-0.093117,-0.643442
3,-1.437393,-0.640696,-1.033397,0.187305,1.16863,2.118765,-0.172577,-0.081677,-0.088724,1.326522,-0.372516,-0.590609,-0.067037,0.392233,-0.128367,-0.302727,-0.881914,-0.093117,-0.643442
4,-1.437393,-0.640696,-0.912564,2.779896,1.16863,2.118765,-0.172577,-0.081677,-0.088724,1.326522,-0.372516,2.153486,-0.067037,0.392233,-0.228708,-0.302727,0.311727,-0.093117,-0.643442


In [109]:
## Split the table into predictors and the dependent feature ('adr').
# NOTE(review): despite the *_train names, no train/test split is visible in
# this notebook — confirm the CSV already contains training rows only.

# Predictors: every column except the target.
X_train = dataset.drop(columns=['adr'])

# Target kept as a single-column DataFrame (double brackets), so it can be
# joined back onto the selected predictors later.
y_train = dataset[['adr']]

In [110]:
# Wrap a Lasso regressor in SelectFromModel to pick features from the fitted
# coefficients.
# NOTE(review): random_state only influences Lasso when selection='random';
# with the default cyclic selection it has no effect and is kept here purely
# for explicitness. alpha=0.005 is the L1 penalty strength — a hand-picked
# value as far as this notebook shows.
feature_sel_model = SelectFromModel(
    estimator=Lasso(alpha=0.005, random_state=0),
)

# Fit on the predictors/target; fit() returns the selector itself, which is
# what the cell displays.
feature_sel_model.fit(X=X_train, y=y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))

In [111]:
# Boolean mask over the columns of X_train: True where the selector keeps the feature.
feature_sel_model.get_support(indices=False)

array([ True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True, False, False,  True,  True,  True,  True,  True])

In [112]:
# Index X_train's column labels with the selector's boolean mask to get the
# names of the retained features.
selected_feat = X_train.columns[feature_sel_model.get_support()]

# Report how many candidate features there were and how many were kept.
print('total features: {}'.format(len(X_train.columns)))
print('selected features: {}'.format(selected_feat.size))

# Display the retained column names as the cell output.
selected_feat

total features: 18
selected features: 15


Index(['hotel', 'is_canceled', 'lead_time', 'meal', 'market_segment',
       'distribution_channel', 'is_repeated_guest', 'previous_cancellations',
       'reserved_room_type', 'booking_changes', 'customer_type',
       'required_car_parking_spaces', 'total_of_special_requests',
       'total_members', 'total_nights_stayed'],
      dtype='object')

In [113]:
# Preview the first five rows restricted to the selected feature columns.
X_train.loc[:, selected_feat].head()

Unnamed: 0,hotel,is_canceled,lead_time,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,reserved_room_type,booking_changes,customer_type,required_car_parking_spaces,total_of_special_requests,total_members,total_nights_stayed
0,-1.437393,-0.640696,-0.939416,0.187305,1.16863,2.118765,-0.172577,-0.081677,-0.561389,-0.372516,0.392233,-0.302727,-0.881914,-1.588769,-1.023513
1,-1.437393,-0.640696,-0.85886,0.187305,-2.967412,-2.601195,-0.172577,-0.081677,-0.561389,-0.372516,0.392233,-0.302727,-0.881914,-1.588769,-1.023513
2,-1.437393,-0.640696,-0.845434,0.187305,0.341421,-0.241215,-0.172577,-0.081677,-0.561389,-0.372516,0.392233,-0.302727,0.311727,-0.093117,-0.643442
3,-1.437393,-0.640696,-1.033397,0.187305,1.16863,2.118765,-0.172577,-0.081677,1.326522,-0.372516,0.392233,-0.302727,-0.881914,-0.093117,-0.643442
4,-1.437393,-0.640696,-0.912564,2.779896,1.16863,2.118765,-0.172577,-0.081677,1.326522,-0.372516,0.392233,-0.302727,0.311727,-0.093117,-0.643442


In [114]:
# Print each retained feature name on its own line.
for feature in selected_feat.tolist():
    print(feature)

hotel
is_canceled
lead_time
meal
market_segment
distribution_channel
is_repeated_guest
previous_cancellations
reserved_room_type
booking_changes
customer_type
required_car_parking_spaces
total_of_special_requests
total_members
total_nights_stayed


In [120]:
# Re-attach the target column to the selected predictors (index-aligned join)
# and write the result to 'hotelPriceCleanedFS.csv' without the row index.
(
    X_train.loc[:, selected_feat]
    .join(y_train)
    .to_csv('hotelPriceCleanedFS.csv', index=False)
)