In [22]:
## Dependencies
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [19]:
## Ilya, your code starts here

In [20]:
# Path (relative to the notebook) of the cleaned listings CSV.
file = "data_cleaning/Output/cleaned_data.csv"

# Read the whole file into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=file)
df.head(5)

Unnamed: 0,zipcode,room_type,accommodates,bedrooms,bathrooms,beds,minimum_nights,number_of_reviews,price,price_bin
0,94117,Entire home/apt,3,1.0,1.0,2.0,1,240,170.0,bin 170-179.99
1,94110,Entire home/apt,5,2.0,1.0,3.0,30,111,235.0,bin 230-239.99
2,94117,Private room,2,1.0,4.0,1.0,32,19,65.0,bin 60-69.99
3,94117,Private room,2,1.0,4.0,1.0,32,8,65.0,bin 60-69.99
4,94117,Entire home/apt,4,2.0,1.5,2.0,5,28,703.0,bin 700-709.99


In [23]:
# Feature matrix X: everything except the two price-derived columns.
# The CSV also carries its old row index as "Unnamed: 0" (see the df.head()
# preview); it is only a row counter, so it is dropped as well instead of
# being fed to the model as a feature. errors="ignore" keeps the cell working
# if the CSV is later re-exported without that column.
data = (
    df
    .drop(columns=["price", "price_bin"])
    .drop(columns=["Unnamed: 0"], errors="ignore")
)
feature_names = data.columns

# Target y: the price-bin label, forced to str so every class label has the
# same type.
target = df["price_bin"].map(str)

# Preview the features (must be the last expression to be displayed).
data.head()

In [24]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in a single call.

    Each selected column is replaced by the integer codes produced by a
    fresh ``LabelEncoder``. The object follows the scikit-learn
    ``fit`` / ``transform`` / ``fit_transform`` calling convention.

    Note: ``fit`` learns nothing. Every ``transform`` call re-fits the
    encoders on the data it receives, so the codes are only consistent
    within one call (encode the full frame before splitting it).
    """

    def __init__(self, columns=None):
        # Column names to encode; None means "encode every column".
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for API compatibility)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; it is copied, never modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which the columns named in ``self.columns``
            (or all columns when ``self.columns`` is None) hold the codes
            returned by ``LabelEncoder().fit_transform``.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # long-standing equivalent. Iterate the untouched input while
            # writing the encoded values into the copy.
            for colname, col in X.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Convenience wrapper: ``fit`` followed by ``transform`` on ``X``."""
        return self.fit(X, y).transform(X)

In [25]:
# Replace the two categorical columns (room_type, zipcode) with integer codes.
# NOTE(review): label codes give zipcode/room_type an arbitrary numeric order
# that a linear model will treat as ordinal -- confirm that skipping one-hot
# encoding is intended.
data2 = MultiColumnLabelEncoder(
    columns=['room_type', 'zipcode'],
).fit_transform(data)
data2.head()

Unnamed: 0,zipcode,room_type,accommodates,bedrooms,bathrooms,beds,minimum_nights,number_of_reviews
0,14,0,3,1.0,1.0,2.0,1,240
1,8,0,5,2.0,1.0,3.0,30,111
2,14,1,2,1.0,4.0,1.0,32,19
3,14,1,2,1.0,4.0,1.0,32,8
4,14,0,4,2.0,1.5,2.0,5,28


In [26]:
# Split features/target into train and test partitions (scikit-learn's default
# test size); the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data2,
    target,
    random_state=4,
)

In [27]:
# Learn the per-feature min/max on the training partition only, then apply the
# same scaling to both partitions so no test information leaks into the fit.
X_minmax = MinMaxScaler()
X_minmax.fit(X_train)

X_train_scaled = X_minmax.transform(X_train)
X_test_scaled = X_minmax.transform(X_test)

# Show the scaled training matrix.
X_train_scaled

array([[7.69230769e-02, 5.00000000e-01, 6.66666667e-02, ...,
        7.14285714e-02, 0.00000000e+00, 2.19244823e-02],
       [3.84615385e-01, 5.00000000e-01, 6.66666667e-02, ...,
        7.14285714e-02, 2.90000003e-07, 0.00000000e+00],
       [1.53846154e-01, 0.00000000e+00, 6.66666667e-02, ...,
        7.14285714e-02, 2.90000003e-07, 1.21802680e-03],
       ...,
       [3.84615385e-02, 0.00000000e+00, 6.66666667e-02, ...,
        7.14285714e-02, 0.00000000e+00, 9.62241169e-02],
       [9.23076923e-01, 0.00000000e+00, 2.00000000e-01, ...,
        1.42857143e-01, 2.90000003e-07, 8.89159562e-02],
       [8.46153846e-01, 0.00000000e+00, 6.66666667e-02, ...,
        7.14285714e-02, 2.90000003e-07, 1.82704019e-02]])

In [29]:
# Baseline classifier with default hyper-parameters, predicting the price bin.
# NOTE(review): the model is fitted on the *unscaled* X_train; X_train_scaled /
# X_test_scaled from the scaling cell are not used in the cells visible here.
# If scaling was intended, the fit, score and grid-search cells all need to
# switch to the scaled arrays together.
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
# Report the classifier's score() (mean accuracy for a scikit-learn classifier)
# on the train and test partitions, using the same unscaled features the model
# was fitted on.
print(f'Training score: {model.score(X_train, y_train)}')
print(f'Testing score: {model.score(X_test, y_test)}')

Training score: 0.04230176400068505
Testing score: 0.05084745762711865


In [33]:
# Hyper-parameter grid for the logistic regression: three values of C crossed
# with both penalty types, i.e. six candidate settings.
param_grid = dict(
    C=[1.0, 5.0, 10.0],
    penalty=["l1", "l2"],
)

# Cross-validated exhaustive search over the grid; verbose=3 logs every fit.
grid = GridSearchCV(estimator=model, param_grid=param_grid, verbose=3)

In [34]:
# Run the cross-validated search on the training partition (unscaled features,
# matching the baseline model fit).
grid.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=1.0, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1.0, penalty=l1, score=0.108, total=   6.6s
[CV] C=1.0, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.6s remaining:    0.0s


[CV] ................... C=1.0, penalty=l1, score=0.107, total=   8.1s
[CV] C=1.0, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.6s remaining:    0.0s


[CV] ................... C=1.0, penalty=l1, score=0.117, total=   0.9s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ................... C=1.0, penalty=l2, score=0.041, total=   1.4s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ................... C=1.0, penalty=l2, score=0.042, total=   1.6s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ................... C=1.0, penalty=l2, score=0.117, total=   1.1s
[CV] C=5.0, penalty=l1 ...............................................
[CV] ................... C=5.0, penalty=l1, score=0.107, total=   6.6s
[CV] C=5.0, penalty=l1 ...............................................
[CV] ................... C=5.0, penalty=l1, score=0.113, total=  11.4s
[CV] C=5.0, penalty=l1 ...............................................
[CV] ................... C=5.0, penalty=l1, score=0.115, total=   1.1s
[CV] C=5.0, penalty=l2 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  1.1min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1.0, 5.0, 10.0], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [35]:
# Best hyper-parameter combination found by the search and its mean
# cross-validated score.
print(grid.best_params_)
print(grid.best_score_)

{'C': 10.0, 'penalty': 'l1'}
0.11217674259290975
