In [1]:
## Dependencies
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
##Ilya, your code starts here

In [3]:
# Read the cleaned listings table produced by the data-cleaning step.
# The path is relative to the notebook's working directory.
file = "data_cleaning/Output/cleaned_data_5_bin.csv"
df = pd.read_csv(filepath_or_buffer=file)

# Preview the first five rows (the default for head()).
df.head(n=5)

Unnamed: 0,zipcode,room_type,accommodates,bedrooms,bathrooms,beds,minimum_nights,number_of_reviews,price,price_bin
0,94117,Entire home/apt,3,1.0,1.0,2.0,1,240,170.0,bin 166-235
1,94110,Entire home/apt,5,2.0,1.0,3.0,30,111,235.0,bin 166-235
2,94117,Private room,2,1.0,4.0,1.0,32,19,65.0,bin 0-85
3,94117,Private room,2,1.0,4.0,1.0,32,8,65.0,bin 0-85
4,94110,Private room,3,1.0,1.0,1.0,1,736,139.0,bin 123-165


In [4]:
# Feature matrix (X): every column except the raw price and the price-bin label.
# "Unnamed: 0" looks like the row index that was written out with the CSV; it
# carries no listing information, so it is excluded from the features as well.
# errors="ignore" keeps the cell working if the CSV is re-exported without it.
data = (
    df.drop(columns=["price", "price_bin"])
      .drop(columns=["Unnamed: 0"], errors="ignore")
)
feature_names = data.columns

# Target (y): the price-bin label, forced to str so every class is a plain string.
target = df["price_bin"].map(str)

In [5]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Each selected column is replaced by the integer codes produced by a
    fresh ``LabelEncoder``. Exposes ``fit`` / ``transform`` /
    ``fit_transform`` methods.

    Note: the encoders are created and fitted inside ``transform`` and are
    not stored, so two separate ``transform`` calls (e.g. on a train frame
    and a test frame) are encoded independently of each other.
    """

    def __init__(self,columns = None):
        """Remember which columns to encode; ``None`` means every column."""
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        """No-op: nothing is learned here; returns ``self``."""
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.

        Returns a copy of X; the input frame is left unmodified.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent (name, Series) iterator and also exists on older versions.
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        """Convenience wrapper: ``fit`` followed by ``transform`` on the same X."""
        return self.fit(X,y).transform(X)

In [6]:
# Turn the two categorical columns (room_type, zipcode) into integer codes.
# Plain label encoding is used here rather than one-hot encoding.
data2 = MultiColumnLabelEncoder(
    columns=['room_type', 'zipcode'],
).fit_transform(data)

# Preview the encoded frame.
data2.head()

Unnamed: 0,zipcode,room_type,accommodates,bedrooms,bathrooms,beds,minimum_nights,number_of_reviews
0,14,0,3,1.0,1.0,2.0,1,240
1,8,0,5,2.0,1.0,3.0,30,111
2,14,1,2,1.0,4.0,1.0,32,19
3,14,1,2,1.0,4.0,1.0,32,8
4,8,1,3,1.0,1.0,1.0,1,736


In [7]:
# Hold out a test set using train_test_split's default split size;
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data2,
    target,
    random_state=4,
)

In [8]:
# Fit the min-max scaler on the training split only, then apply the same
# fitted scaler to both splits so the test data does not influence the scaling.
# NOTE(review): the cells that follow fit and score on X_train / X_test, not on
# these scaled arrays — confirm whether the scaled versions were meant to be used.
X_minmax = MinMaxScaler()
X_minmax.fit(X_train)
X_train_scaled = X_minmax.transform(X_train)
X_test_scaled = X_minmax.transform(X_test)

# Show the scaled training array.
X_train_scaled

array([[0.46153846, 0.        , 0.2       , ..., 0.14285714, 0.02902903,
        0.        ],
       [0.03846154, 0.        , 0.2       , ..., 0.07142857, 0.05905906,
        0.00974421],
       [0.38461538, 0.        , 0.26666667, ..., 0.21428571, 0.002002  ,
        0.00609013],
       ...,
       [0.23076923, 0.5       , 0.2       , ..., 0.14285714, 0.        ,
        0.02314251],
       [0.57692308, 0.5       , 0.        , ..., 0.07142857, 0.001001  ,
        0.39342266],
       [0.30769231, 0.        , 0.06666667, ..., 0.07142857, 0.02902903,
        0.27161998]])

In [9]:
# Baseline classifier with scikit-learn's default hyper-parameters.
# NOTE(review): this is trained on the unscaled X_train even though
# X_train_scaled is available — confirm that is intentional.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Mean accuracy of the baseline model on the training split...
print(
    f'Training score: {model.score(X_train, y_train)}'
)
# ...and on the held-out test split.
print(
    f'Testing score: {model.score(X_test, y_test)}'
)

Training score: 0.44989339019189767
Testing score: 0.45231752797016517


In [11]:
# Hyper-parameter search space: regularisation strength C x penalty type
# (3 x 2 = 6 candidate settings).
# NOTE(review): 'l1' is only accepted by some LogisticRegression solvers, and the
# estimator's default solver depends on the scikit-learn version — confirm.
param_grid = dict(
    C=[1.0, 5.0, 10.0],
    penalty=["l1", "l2"],
)

# Cross-validated grid search around the baseline estimator; verbose=3 logs each fit.
grid = GridSearchCV(estimator=model, param_grid=param_grid, verbose=3)

In [12]:
# Run the cross-validated search on the training split.
grid.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=1.0, penalty=l1 ...............................................
[CV] ................... C=1.0, penalty=l1, score=0.433, total=   0.1s
[CV] C=1.0, penalty=l1 ...............................................
[CV] ................... C=1.0, penalty=l1, score=0.453, total=   0.1s
[CV] C=1.0, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ................... C=1.0, penalty=l1, score=0.459, total=   0.1s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ................... C=1.0, penalty=l2, score=0.435, total=   0.1s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ................... C=1.0, penalty=l2, score=0.454, total=   0.1s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ................... C=1.0, penalty=l2, score=0.460, total=   0.1s
[CV] C=5.0, penalty=l1 ...............................................
[CV] ................... C=5.0, penalty=l1, score=0.435, total=   0.1s
[CV] C=5.0, penalty=l1 ...............................................
[CV] ................... C=5.0, penalty=l1, score=0.452, total=   0.1s
[CV] C=5.0, penalty=l1 ...............................................
[CV] ................... C=5.0, penalty=l1, score=0.458, total=   0.1s
[CV] C=5.0, penalty=l2 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    1.1s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1.0, 5.0, 10.0], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [13]:
# Report the winning parameter combination and its mean cross-validated score,
# one per line.
print(grid.best_params_, grid.best_score_, sep='\n')

{'C': 1.0, 'penalty': 'l2'}
0.44971570717839376
