- KNN
- Scikit-learn pipelines

In [115]:
import pandas as pd
import numpy as np

In [116]:
df = pd.read_csv('2.linear_regression_models/AB_NYC_2019.csv',
            nrows=4000)

# KNN

In [117]:
from sklearn.neighbors import KNeighborsRegressor

In [118]:
from sklearn.model_selection import train_test_split

In [119]:
df_train, df_valid = train_test_split(df)

In [120]:
len(df_train)

3000

In [121]:
X_train = df_train[['latitude', 'longitude']].fillna(0).values
y_train = df_train.price.values

In [122]:
X_valid = df_valid[['latitude', 'longitude']].fillna(0).values
y_valid = df_valid.price.values

In [123]:
knn = KNeighborsRegressor(n_neighbors=5)

In [124]:
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [125]:
y_pred = knn.predict(X_valid)

In [126]:
for i in range(1, 50):
    knn = KNeighborsRegressor(n_neighbors=5)
    knn.fit(X_train, y_train)
    
    y_pred = knn.predict(X_valid)
    # rmse

# Pipelines

In [127]:
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

In [128]:
df.name = df.name.fillna('')

for c in ['neighbourhood', 'neighbourhood_group', 'room_type']:
    df[c] = df[c].str.lower().str.replace(' ', '_').fillna('NA')

In [129]:
from sklearn.compose import ColumnTransformer # hstack
from sklearn.feature_extraction.text import CountVectorizer # for text
from sklearn.preprocessing import OneHotEncoder # for categorical

In [130]:
numerical = [
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365'
]

Below we'll use CountVectorizer instead of OneHotEncoder
as the former allows us to specify the min amount of times feature can appear to be considered a feature

<h3>Hence we # OneHotEncoder</h3> 

In [131]:
# name of the transformer, actual transformer, list of categories
# passthrough means don't do anything

# '.*' means the whole string
transformations = [
    ('numerical', 'passthrough', numerical),
    # ('categories', OneHotEncoder(dtype='int32'), ['neighbourhood_group', 'neighbourhood', 'room_type']),
    
    ('ng', CountVectorizer(token_pattern='.*', min_df=100, dtype='int32'), 'neighbourhood_group'),
    ('n', CountVectorizer(token_pattern='.*', min_df=50, dtype='int32'),'neighbourhood'),
    ('r', CountVectorizer(token_pattern='.*', min_df=100, dtype='int32'), 'room_type'),
    
    ('name', CountVectorizer(min_df=100, dtype='int32'), 'name')
]

In [132]:
col_transform = ColumnTransformer(
    transformations,
    remainder='drop'
)
# "drop" means remove all the other features

In [133]:
col_transform.fit(df)

ColumnTransformer(transformers=[('numerical', 'passthrough',
                                 ['latitude', 'longitude', 'price',
                                  'minimum_nights', 'number_of_reviews',
                                  'reviews_per_month',
                                  'calculated_host_listings_count',
                                  'availability_365']),
                                ('ng',
                                 CountVectorizer(dtype='int32', min_df=100,
                                                 token_pattern='.*'),
                                 'neighbourhood_group'),
                                ('n',
                                 CountVectorizer(dtype='int32', min_df=50,
                                                 token_pattern='.*'),
                                 'neighbourhood'),
                                ('r',
                                 CountVectorizer(dtype='int32', min_df=100,
                             

In [134]:
X = col_transform.transform(df)

In [135]:
col_transform.get_feature_names()[::10]
# numerical, categorical, text features

['latitude',
 'ng__manhattan',
 'n__east_village',
 'n__upper_west_side',
 'name__apt',
 'name__cozy',
 'name__in',
 'name__of',
 'name__village']

In [136]:
X

<4000x84 sparse matrix of type '<class 'numpy.float64'>'
	with 64256 stored elements in Compressed Sparse Row format>

In [137]:
y = df.price.values

In [138]:
from sklearn.linear_model import LinearRegression

In [139]:
lr = LinearRegression()
lr.fit(X, y)

LinearRegression()

In [140]:
from sklearn.pipeline import Pipeline

In [141]:
pipeline = Pipeline([
    ('transform', col_transform),
    ('lr', LinearRegression())
])

In [142]:
pipeline.fit(df, df.price.values)

Pipeline(steps=[('transform',
                 ColumnTransformer(transformers=[('numerical', 'passthrough',
                                                  ['latitude', 'longitude',
                                                   'price', 'minimum_nights',
                                                   'number_of_reviews',
                                                   'reviews_per_month',
                                                   'calculated_host_listings_count',
                                                   'availability_365']),
                                                 ('ng',
                                                  CountVectorizer(dtype='int32',
                                                                  min_df=100,
                                                                  token_pattern='.*'),
                                                  'neighbourhood_group'),
                                                 ('n',
     

In [143]:
pipeline.predict(df)

array([149.00000008, 225.00000888, 149.99999557, ...,  49.00000071,
        62.99999315, 100.00000335])

# Custom transformer inside Pipeline

In [144]:
from sklearn.base import TransformerMixin

In [145]:
class ConcatenteTransformer(TransformerMixin):
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        columns = list(X.columns)
        
        res = ''
        
        for c in columns:
            res = res + ' ' + c + '=' + X[c]
        
        return res.str.strip()

In [146]:
ct = ConcatenteTransformer()

In [147]:
p2 = Pipeline([
    ('concatenate', ConcatenteTransformer()),
    ('vectorize', CountVectorizer(token_pattern='\S+', min_df=100)) 
])

In [148]:
p2.fit_transform(df[['neighbourhood_group', 'neighbourhood', 'room_type']])

<4000x17 sparse matrix of type '<class 'numpy.int64'>'
	with 10165 stored elements in Compressed Sparse Row format>

# Put inside our transformation

In [149]:
# here we use our custom pipeline instead of 3 separate
# rows that you can see in the previous transformations

transformations = [
    ('numerical', 'passthrough', numerical),
    
    ('categories', Pipeline([
        ('concatenate', ConcatenteTransformer()),
        ('vectorize', CountVectorizer(token_pattern='\S+', min_df=100)) 
]), ['neighbourhood_group', 'neighbourhood', 'room_type']),

    ('name', CountVectorizer(min_df=100, dtype='int32'), 'name')
]

In [150]:
col_transform = ColumnTransformer(
    transformations,
    remainder='drop'
)

In [151]:
pipeline = Pipeline([
    ('transformer', col_transform),
    ('lr', LinearRegression())
])

In [152]:
pipeline.fit(df, df.price.values)
pipeline.predict(df)

array([149.00000071, 225.00000739, 149.99998623, ...,  49.00001048,
        63.00000505,  99.99999369])

In [153]:
import pickle

In [154]:
with open('pipeline.bin', 'wb') as f_out:
    pickle.dump(pipeline, f_out)