In [1]:
import pandas as pd
import numpy as np

In [3]:
# Read only the first 3000 rows of the AB_NYC_2019 CSV.
df = pd.read_csv(
    '2.linear_regression_models/AB_NYC_2019.csv',
    nrows=3000,
)

In [4]:
df.shape

(3000, 16)

In [5]:
df.head(3)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365


In [6]:
df.neighbourhood_group.value_counts()

Manhattan        1373
Brooklyn         1370
Queens            199
Bronx              33
Staten Island      25
Name: neighbourhood_group, dtype: int64

# Keep only the three most frequent neighbourhood groups.
# We'll use them as the classes of a multiclass model.

In [7]:
# The three neighbourhood groups kept as target classes; all other rows are dropped.
groups = ['Manhattan', 'Brooklyn', 'Queens']
df = df.loc[df['neighbourhood_group'].isin(groups)]

In [8]:
# Renumber the rows from 0 after filtering; drop=True discards the old index
# instead of keeping it as a column.
df = df.reset_index(drop=True)

In [9]:
df.shape

(2942, 16)

In [10]:
# Feature matrix with two columns: latitude and longitude.
coordinate_cols = ['latitude', 'longitude']
X = df[coordinate_cols].values

In [12]:
# Target labels: the neighbourhood group of each row.
y = df['neighbourhood_group'].values

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
# Logistic regression with default settings. y holds three classes, so this is
# a multiclass fit on the two coordinate features.
lr = LogisticRegression()
lr.fit(X, y)

LogisticRegression()

In [15]:
# One row of coefficients per class (3) and one column per feature (2).
lr.coef_

array([[-12.96053022,  -7.14091411],
       [  7.34784065,   4.03172651],
       [  5.61268957,   3.1091876 ]])

In [16]:
# Likewise, there is one intercept (bias) per class: three values instead of one.
lr.intercept_

array([ 0.28193733, -0.40945164,  0.12751432])

In [18]:
lr.predict(X)

array(['Brooklyn', 'Manhattan', 'Manhattan', ..., 'Brooklyn', 'Brooklyn',
       'Manhattan'], dtype=object)

In [17]:
# Class probabilities for every row: each row sums to 1 and the columns follow
# the order of lr.classes_.
lr.predict_proba(X)

array([[0.81959348, 0.15370195, 0.02670457],
       [0.37977902, 0.54118205, 0.07903893],
       [0.11232869, 0.78719635, 0.10047495],
       ...,
       [0.52672482, 0.41022042, 0.06305476],
       [0.47370981, 0.4572758 , 0.06901439],
       [0.13015528, 0.77011504, 0.09972968]])

In [19]:
# Accuracy on the same data the model was fitted on: the share of rows whose
# predicted label equals the true label.
np.mean(lr.predict(X) == y)

0.6801495581237254

In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

In [27]:
# A shallow decision tree (depth limited to 3) fitted on the same X and y.
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X, y)

DecisionTreeClassifier(max_depth=3)

In [28]:
# Accuracy of the tree on the data it was fitted on.
np.mean(dt.predict(X) == y)

0.9112848402447314

In [31]:
# Render the fitted tree's split rules as indented text.
tree_rules = export_text(dt, feature_names=['latitude', 'longitude'])
print(tree_rules)

|--- latitude <= 40.72
|   |--- longitude <= -73.86
|   |   |--- longitude <= -73.99
|   |   |   |--- class: Brooklyn
|   |   |--- longitude >  -73.99
|   |   |   |--- class: Brooklyn
|   |--- longitude >  -73.86
|   |   |--- class: Queens
|--- latitude >  40.72
|   |--- longitude <= -73.93
|   |   |--- longitude <= -73.96
|   |   |   |--- class: Manhattan
|   |   |--- longitude >  -73.96
|   |   |   |--- class: Manhattan
|   |--- longitude >  -73.93
|   |   |--- latitude <= 40.82
|   |   |   |--- class: Queens
|   |   |--- latitude >  40.82
|   |   |   |--- class: Manhattan



# Feature importance for continuous target (regression)

In [36]:
# Names of all columns whose dtype is not 'object'.
[column for column, dtype in df.dtypes.items() if dtype != 'object']

['id',
 'host_id',
 'latitude',
 'longitude',
 'price',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [37]:
# Numeric columns to compare against price; 'id' and 'price' itself are omitted.
numeric = [
    'host_id',
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

In [42]:
# Absolute correlation of each numeric column with log(1 + price).
log_price = np.log1p(df.price)
df[numeric].corrwith(log_price).abs()

host_id                           0.045893
latitude                          0.018935
longitude                         0.364144
minimum_nights                    0.012581
number_of_reviews                 0.095637
reviews_per_month                 0.098535
calculated_host_listings_count    0.000751
availability_365                  0.002821
dtype: float64

In [43]:
# Equal-width binning: the price range is split into 10 intervals of the same
# width, so almost all of the rows shown land in the first interval.
pd.cut(df.price, bins=10)

0         (5.01, 509.0]
1         (5.01, 509.0]
2         (5.01, 509.0]
3         (5.01, 509.0]
4         (5.01, 509.0]
             ...       
2937      (5.01, 509.0]
2938      (5.01, 509.0]
2939      (5.01, 509.0]
2940    (509.0, 1008.0]
2941      (5.01, 509.0]
Name: price, Length: 2942, dtype: category
Categories (10, interval[float64]): [(5.01, 509.0] < (509.0, 1008.0] < (1008.0, 1507.0] < (1507.0, 2006.0] ... (3004.0, 3503.0] < (3503.0, 4002.0] < (4002.0, 4501.0] < (4501.0, 5000.0]]

In [49]:
# The same equal-width binning on log(1 + price); the number of rows per bin
# is still very uneven.
pd.cut(np.log1p(df.price), bins=10).value_counts()

(4.234, 4.846]    1124
(4.846, 5.458]     991
(3.622, 4.234]     374
(5.458, 6.07]      339
(6.07, 6.682]       62
(3.01, 3.622]       24
(6.682, 7.293]      19
(7.293, 7.905]       5
(7.905, 8.517]       3
(2.392, 3.01]        1
Name: price, dtype: int64

![title](price_cut.jpg)

But we need bins that each contain roughly the **same number** of rows (equal-frequency bins, not equal-width bins)

In [76]:
# Quantile-based binning: 10 bins holding roughly equal numbers of rows.
# Note that, because no log transform is applied, the whole long tail of
# high prices ends up in a single bin.
qprice = pd.qcut(df.price, q=10)

In [77]:
qprice

0        (147.6, 175.0]
1        (200.0, 269.0]
2        (147.6, 175.0]
3          (75.0, 90.0]
4          (75.0, 90.0]
             ...       
2937     (104.0, 125.0]
2938     (147.6, 175.0]
2939     (200.0, 269.0]
2940    (269.0, 5000.0]
2941       (75.0, 90.0]
Name: price, Length: 2942, dtype: category
Categories (10, interval[float64]): [(9.999, 60.0] < (60.0, 75.0] < (75.0, 90.0] < (90.0, 104.0] ... (147.6, 175.0] < (175.0, 200.0] < (200.0, 269.0] < (269.0, 5000.0]]

In [78]:
from sklearn.metrics import mutual_info_score

In [81]:
# Mutual information between the neighbourhood column and the binned price.
mutual_info_score(df.neighbourhood, qprice)

0.2796110818606619

In [82]:
# Mutual information between the room_type column and the binned price.
mutual_info_score(df.room_type, qprice)

0.2464904922022662

# How to work with text variables

In [84]:
df.name.head(3)

0     Clean & quiet apt home by the park
1                  Skylit Midtown Castle
2    THE VILLAGE OF HARLEM....NEW YORK !
Name: name, dtype: object

In [85]:
# A tiny sample to experiment with: the first three values of the 'name' column.
names = df['name'].head(3)

In [86]:
from sklearn.feature_extraction.text import CountVectorizer

In [87]:
cv = CountVectorizer()

In [88]:
# Learn the vocabulary from the three sample names.
cv.fit(names)

CountVectorizer()

In [90]:
# NOTE(review): this rebinds X, which earlier held the latitude/longitude
# feature matrix. Re-running the model cells above after this cell would no
# longer see the coordinate data; consider a distinct name.
X = cv.transform(names)

In [91]:
X.toarray()

array([[1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1]])

In [92]:
# Vocabulary in column order: column i of the matrix counts how many times
# word i occurs in each name.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older versions.
get_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
list(get_names())

['apt',
 'by',
 'castle',
 'clean',
 'harlem',
 'home',
 'midtown',
 'new',
 'of',
 'park',
 'quiet',
 'skylit',
 'the',
 'village',
 'york']

In [93]:
# Show the count matrix as a table with one column per vocabulary word.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older versions.
get_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
pd.DataFrame(X.toarray(), columns=get_names())

Unnamed: 0,apt,by,castle,clean,harlem,home,midtown,new,of,park,quiet,skylit,the,village,york
0,1,1,0,1,0,1,0,0,0,1,1,0,1,0,0
1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,1,1,0,0,0,1,1,1
