In [31]:
import copy
import pandas as pd
import numpy as np
import feather
import pickle

from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any pandas / scikit-learn warnings
# raised by the cells below — confirm that this is intended.
warnings.filterwarnings('ignore')
# Show up to 100 characters per cell when a DataFrame/Series is displayed.
pd.set_option('display.max_colwidth', 100)

In [4]:
# Directories used for the input/output data and for serialized models.
DATA_PATH = '../data/'
MODELS_PATH = '../models/'

# Read the 'housing_clean' frame and store each post's character count
# in a `text_len` column.
df = (
    feather.read_dataframe(f'{DATA_PATH}housing_clean')
    .assign(text_len=lambda posts: posts['text'].str.len())
)

The Alpaca group seems to be the most actively moderated one, with little noise.

In [5]:
# Keep only the posts of the Alpaca group.  Take an explicit copy: columns are
# assigned to `alpaca_df` in later cells, and writing into a boolean-indexed
# slice of `df` is chained assignment (SettingWithCopyWarning).
alpaca_df = df[df.group =='https://www.facebook.com/groups/673389662794979/'].copy()

In [6]:
# Rows where at least one of the three flags (write_to_seller, was_link,
# was_photo) equals True.
supply_flags = alpaca_df[['write_to_seller', 'was_link', 'was_photo']].eq(True)
supply = alpaca_df[supply_flags.any(axis=1)]

In [7]:
# Compare the size of the whole group with the size of the flagged subset.
n_posts, n_supply = len(alpaca_df), len(supply)
print(f'total posts - {n_posts}')
print(f'supposedly supply - {n_supply}')

total posts - 1181
supposedly supply - 557


If a post has photos, links (e.g. to Craigslist) or a "Message Seller" button, we count it as "supply".

In [8]:
# Shape of the subset in which any of write_to_seller / was_link / was_photo is True.
alpaca_df.loc[alpaca_df[['write_to_seller', 'was_link', 'was_photo']].eq(True).any(axis=1)].shape

(557, 16)

In [9]:
# Label every Alpaca post in one step: True when any of the three heuristic
# flags is set, False otherwise.  Assigning the boolean mask directly gives a
# real bool column; filling True into a new column and then False over the
# remaining gaps would typically leave an object-dtype label column.
supply_mask = (alpaca_df.write_to_seller == True) | (alpaca_df.was_link == True) | (alpaca_df.was_photo == True)
alpaca_df['supply'] = supply_mask

In [10]:
# Eyeball ten random posts that were NOT labelled as supply.
# The fixed seed keeps the displayed sample identical across re-runs.
alpaca_df[alpaca_df.supply == False].text.sample(10, random_state=17)

2429    we rent an apartment in lawrence next to andover . more information by inbox . se renta un apart...
2849    hello everyone ! i ’ m looking for an apartment in or around boston for january 1st . my budget ...
370     i ’ m looking to rent a room starting february / march preferably in allston , brighton , or bro...
2512    hi , i ’ m a medical student looking for a room near brigham for the month of january . please l...
2902    hey looking for a private or shared room with a girl near beth israel deaconess medical center h...
781     looking for a room with 1 - 3 roommates preferably in or around the fenway - kenmore , columbus ...
1742    hello ! i am a 23y / o female searching for a private room in a shared apt . with a lease starti...
1945    hi all , i ' m looking for a private room sublet with washer / dryer in unit and a maximum of 1 ...
84      hello , i am looking for a place to stay from feb 1st till may 15th 2019 . preferably a one bedr...
1600    hi ! my name is heat

In [11]:
# Mean character length of the posts labelled supply == False.
alpaca_df.loc[alpaca_df['supply'] == False, 'text'].str.len().mean()

349.9519230769231

In [12]:
# Mean character length of the posts labelled supply == True.
alpaca_df.loc[alpaca_df['supply'] == True, 'text'].str.len().mean()

633.8509874326751

In [13]:
# (Re)compute the character count of every Alpaca post as the `text_len` column.
alpaca_df = alpaca_df.assign(text_len=alpaca_df['text'].str.len())

In [14]:
# Text features: counts of 1- to 4-grams that occur in at least 5 posts,
# re-weighted with TF-IDF.
cv_text = CountVectorizer(ngram_range=(1, 4), min_df=5)
tfidf = TfidfTransformer()
# Numeric feature: standardized post length.
text_len_scaler = StandardScaler()

# NOTE(review): all three transformers are fitted on the complete Alpaca frame,
# i.e. before the train/validation split made in a later cell, so validation rows
# influence the features — confirm this is acceptable for the reported accuracy.
alpaca_ngram_counts = cv_text.fit_transform(alpaca_df['text'].tolist())
all_text = tfidf.fit_transform(alpaca_ngram_counts)

alpaca_lengths = alpaca_df['text_len'].values.reshape(-1, 1)
scaled_len = text_len_scaler.fit_transform(alpaca_lengths)

In [15]:
# Feature matrix: the TF-IDF columns followed by the scaled length column.
X = hstack([all_text, scaled_len]).tocsr()
y = alpaca_df['supply']

# Positional 70/30 split without shuffling: the first 70% of the rows are used
# for training, the remaining rows for validation.
train_part_size = int(0.7 * len(y))
X_train, X_valid = X[:train_part_size], X[train_part_size:]
y_train, y_valid = y.iloc[:train_part_size], y.iloc[train_part_size:]
df_train, df_valid = alpaca_df.iloc[:train_part_size], alpaca_df.iloc[train_part_size:]

In [16]:
# Fit a linear classifier (SGDClassifier defaults, fixed seed) on the training
# part and score it on the held-out part.
model = SGDClassifier(random_state=17)
model.fit(X_train, y_train)
model_test_pred = model.predict(X_valid)
# accuracy_score is documented as (y_true, y_pred).  Accuracy is symmetric, so
# the value is the same either way, but the documented order keeps the call
# correct if it is ever swapped for an asymmetric metric.
accuracy_score(y_valid, model_test_pred)

0.8873239436619719

In [17]:
# Validation posts whose predicted label differs from the heuristic label.
df_valid.loc[y_valid != model_test_pred, ['text', 'supply']]

Unnamed: 0,text,supply
2310,"hey ! looking for a roommate in a three bedroom apartment in oak square , brighton , ma . we are...",False
2401,"hello ! i know this is super last minute , but i am looking for someone to take over my lease fr...",True
2422,hello all ! i am looking for a roommate to fill a spot in cambridge ! please reach out to me dir...,False
2428,looking for someone to * sublease or take over lease * starting january or february 2019 ! $ 830...,True
2431,"hi , i am a graduate student in longwood looking for a tidy , respectful roommate starting july ...",True
2450,hi this isn ’ t regarding apartments but selling 2 snoop dog tix for tomorrow 1 / 4 at the grand...,True
2547,subletting my room in a 4bed 2 . 5bath house . it ' s 2 min walk to forest hills t stop . rent i...,True
2552,"looking for one occupant for a 1bhk ( 770 sqft ) at st . germain street , boston until 30th june...",False
2555,"looking for one occupant for a 1bhk ( 770 sqft ) at st . germain street , boston until 30th june...",False
2592,"looking for one occupant for a 1bhk ( 770 sqft ) at st . germain street , boston until 30th june...",False


In [18]:
# Score the model on its own training rows and list the rows it still gets
# "wrong".  The predictions get their own name so that `model_test_pred` (the
# validation predictions used by the previous error listing) is not overwritten
# by an array of a different length.
model_train_pred = model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred).
print(accuracy_score(y_train, model_train_pred))
df_train[y_train != model_train_pred][['text', 'supply']]

0.9891041162227603


Unnamed: 0,text,supply
236,"hey everyone , i ' m looking for a roommate in my 2 - bedroom belmont apartment . the room is av...",True
692,hello ! i have one bedroom available in 3bedroom 1bathroom apartment in dorchester . the apartme...,False
1011,,True
1444,"hello , we have a fully furnished room available for short term rent ( 1 month to 8 months ) in ...",True
1452,,True
1706,,True
1816,"there - looking for a roommate in a three bedroom apartment in oak square , brighton , ma . we a...",False
2202,newton 近绿线d 地铁站房间出租从2 ⃣️🈷️ 份开始 ， 全美最安全的区之一 ， 步行2 ⃣️ 分钟到地铁站 ， 5 ⃣️ 分钟内各种风味美食 ， 邮局银行 ， 24小时7 / 11 ...,True
2281,hi everyone ! i have worked my butt off to save up money for a trip to europe with my best frien...,False


We can clearly see that a decent number of the predictions counted as "misclassified" are actually correct, i.e. it is the heuristic label that is wrong. At this point the dataset needs some manual relabelling, guided by these "errors".

Train on the whole dataset.

In [19]:
# Refit a fresh classifier with the same settings on all rows of (X, y);
# `fit` returns the estimator itself, which is displayed as the cell output.
model = SGDClassifier(random_state=17).fit(X, y)
model

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=17, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [20]:
# Posts of the two other groups, selected by group URL.  Explicit copies are
# taken because a `supply` column is added to each frame in later cells;
# writing into a boolean-indexed slice of `df` is chained assignment
# (SettingWithCopyWarning).
boston_apartments_df = df[df.group =='https://www.facebook.com/groups/1210575355774169/'].copy()
harvard_apartments_df = df[df.group =='https://www.facebook.com/groups/735597296550141/'].copy()

In [21]:
# Dimensions of the feature matrix: (number of posts, number of feature columns).
X.shape

(1181, 6536)

In [22]:
# Build the features for `boston_apartments_df` with the already-fitted
# transformers (transform only, nothing is refitted here).
boston_apartments_all_text = tfidf.transform(
    cv_text.transform(boston_apartments_df['text'].tolist())
)
boston_apartments_scaled_len = text_len_scaler.transform(
    boston_apartments_df['text_len'].values.reshape(-1, 1)
)

# Use a dedicated name for the inference matrix: reusing `X` here would overwrite
# the training matrix and break any re-run of the cells that fit on (X, y).
boston_apartments_X = hstack([boston_apartments_all_text, boston_apartments_scaled_len]).tocsr()
boston_apartments_preds = model.predict(boston_apartments_X)
# Rebind to a new frame instead of writing a column into a slice of `df`.
boston_apartments_df = boston_apartments_df.assign(supply=boston_apartments_preds)

In [23]:
# Build the features for `harvard_apartments_df` with the already-fitted
# transformers (transform only, nothing is refitted here).
harvard_apartments_all_text = tfidf.transform(
    cv_text.transform(harvard_apartments_df['text'].tolist())
)
harvard_apartments_scaled_len = text_len_scaler.transform(
    harvard_apartments_df['text_len'].values.reshape(-1, 1)
)

# Use a dedicated name for the inference matrix: reusing `X` here would overwrite
# the training matrix and break any re-run of the cells that fit on (X, y).
harvard_apartments_X = hstack([harvard_apartments_all_text, harvard_apartments_scaled_len]).tocsr()
harvard_apartments_preds = model.predict(harvard_apartments_X)
# Rebind to a new frame instead of writing a column into a slice of `df`.
harvard_apartments_df = harvard_apartments_df.assign(supply=harvard_apartments_preds)

In [24]:
# Distribution of the predicted `supply` label in boston_apartments_df.
boston_apartments_df['supply'].value_counts()

True     835
False    431
Name: supply, dtype: int64

In [25]:
# Distribution of the predicted `supply` label in harvard_apartments_df.
harvard_apartments_df['supply'].value_counts()

True     894
False    352
Name: supply, dtype: int64

In [26]:
# Distribution of the `supply` label in alpaca_df, for comparison.
alpaca_df['supply'].value_counts()

False    624
True     557
Name: supply, dtype: int64

The new groups are skewed towards supply, which is indeed the case when you look at their posts.

In [27]:
# Stack the three per-group frames (each now carrying a `supply` column) row-wise.
# Note that this rebinds `df`, replacing the frame loaded at the top of the notebook.
labelled_groups = [alpaca_df, harvard_apartments_df, boston_apartments_df]
df = pd.concat(labelled_groups, axis=0)

In [28]:
# Replace the index with a fresh 0..n-1 range (the old index is kept as a
# column, since `drop` is not requested) and write the frame next to the input data.
df = df.reset_index()
df.to_feather(f'{DATA_PATH}housing_with_supply')

In [34]:
# Serialize the classifier that was trained on the full Alpaca data.
with open(MODELS_PATH + 'housing_demand_classifier.pkl', 'wb') as f_id:
    pickle.dump(model, f_id)

# The classifier only accepts features produced by the fitted vectorizer,
# TF-IDF transformer and length scaler, so persist them as well; without them
# the pickled model cannot be applied to new text in another session.
with open(MODELS_PATH + 'housing_demand_preprocessing.pkl', 'wb') as f_id:
    pickle.dump(
        {'cv_text': cv_text, 'tfidf': tfidf, 'text_len_scaler': text_len_scaler},
        f_id,
    )