In [None]:
import pandas as pd
import requests
from time import sleep
import os
import json
from geopy.distance import geodesic
import sys
import pickle
import numpy as np
import torch
from sklearn.preprocessing import MultiLabelBinarizer

import numpy as np
import torch
import torch.nn.functional as F

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile

from scipy import spatial
from tqdm import tqdm




In [None]:
from google.colab import drive

# Attach the user's Google Drive to the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Project root on Drive; later cells build every input/output path from it.
folder_path = '/content/drive/MyDrive/ml_proj/'

# Sub-folder of the project root that holds the data files.
data_path = f'{folder_path}data/'



In [None]:
# Load the raw listings table from the project's data folder.
listings_csv_path = folder_path + 'data/listings.csv'
listings = pd.read_csv(listings_csv_path)


In [None]:
# Subset of the raw listing columns carried through the rest of the notebook.
cols = [
    'property_type',
    'description',
    'room_type',
    'accommodates',
    'bedrooms',
    'price',
    'number_of_reviews',
    'review_scores_rating',
    'calculated_host_listings_count',
    'availability_30',
    'minimum_nights',
    'latitude',
    'longitude'
]

# Take an explicit copy: later cells assign new/converted columns into
# `listings_`, and doing that on a selection of `listings` triggers pandas'
# SettingWithCopyWarning. The copy also guarantees `listings` stays untouched.
listings_ = listings[cols].copy()

# Displayed as the cell output: dtype of every selected column.
listings_.dtypes

In [None]:
# Tokenise every listing description on single spaces; entries that are not
# strings become empty token lists so row positions are preserved.
documents = [
    text.split(' ') if type(text) is str else []
    for text in listings.description.values
]

# Tag each token list with its row position, as required by Doc2Vec training.
descriptions_embeddings = [
    TaggedDocument(tokens, [row_number])
    for row_number, tokens in enumerate(documents)
]

# Train a 100-dimensional Doc2Vec model and save it to a temporary file.
m = Doc2Vec(descriptions_embeddings, vector_size=100, window=3, min_count=2, workers=4)
fname = get_tmpfile("doc2vec_model")
m.save(fname)

# Infer one vector per description, in row order, stored as plain Python lists.
doc_inference = [m.infer_vector(tokens).tolist() for tokens in tqdm(documents)]


In [None]:
# Persist the inferred description vectors. The context manager guarantees the
# file is flushed and closed even if pickling raises (the bare open() call
# previously left the handle to be closed by the garbage collector).
with open(folder_path + 'data/doc_inference.pkl', 'wb') as pkl_file:
    pickle.dump(doc_inference, pkl_file)

In [None]:
# Report, for every selected column, how many entries are null.
missing_counts = listings_.isnull().sum()
for column_name, n_missing in missing_counts.items():
    print(column_name + ', Number of Missing Values:', n_missing)


In [None]:
# Clean the selected columns: impute missing numerics, parse prices, and drop
# rows whose capacity or bedroom count is zero.
original = len(listings_)  # row count before any rows are dropped

# Work on an independent frame so the column assignments below never write
# into a slice of another DataFrame (avoids SettingWithCopyWarning).
listings_ = listings_.copy()

# Fill missing bedrooms / review scores with the respective column mean.
listings_['bedrooms'] = listings_['bedrooms'].fillna(value=listings_['bedrooms'].mean())
listings_['review_scores_rating'] = listings_['review_scores_rating'].fillna(
    value=listings_['review_scores_rating'].mean()
)

# Convert price to float by stripping every character that is not a digit,
# sign or decimal point. `regex=True` must be explicit: since pandas 2.0 the
# default is a literal match, which would leave the text unchanged and make
# astype(float) fail. The dtype guard keeps the cell re-runnable once the
# column has already been converted.
if not pd.api.types.is_numeric_dtype(listings_['price']):
    listings_['price'] = (
        listings_['price'].str.replace(r'[^-+\d.]', '', regex=True).astype(float)
    )

# Report how many rows carry inconsistent values.
print('Number of Accommodates 0:', len(listings_[listings_['accommodates'] == 0]))
print('Number of Bedrooms 0:', len(listings_[listings_['bedrooms'] == 0]))
print('Number of Listings with Price $0.00:', len(listings_[listings_['price'] == 0.00]))

# Drop rows with zero capacity or zero bedrooms. The original index labels are
# kept so the remaining rows can still be matched back to `listings`.
# NOTE(review): zero-price rows are counted above but not dropped -- confirm
# that this is intended.
# NOTE(review): `doc_inference` holds one vector per row of the unfiltered
# `listings`; rows removed here are not removed from it.
keep_rows = (listings_['accommodates'] != 0) & (listings_['bedrooms'] != 0)
listings_ = listings_[keep_rows].copy()


In [None]:
import geopandas as gpd

# Attach a `district` column to `listings_` by locating each listing's
# coordinates inside the polygons of the districts shapefile.
districts_shapefile_path = folder_path + 'data/geo_export_92c9a04c-c163-45b1-8509-f021419a9c2f.shp'
chicago_districts = gpd.read_file(districts_shapefile_path)

# One point per row of the unfiltered listings table (x = longitude, y = latitude).
# NOTE(review): no CRS is set on these points; this assumes the shapefile uses
# the same lon/lat coordinates -- confirm.
geometry = gpd.points_from_xy(listings['longitude'], listings['latitude'])
gdf = gpd.GeoDataFrame(listings, geometry=geometry)

# Left join keeps every listing; points inside no polygon get NaN.
# `predicate=` replaces the `op=` keyword, which was deprecated in
# geopandas 0.10 and later removed.
result = gpd.sjoin(gdf, chicago_districts, how='left', predicate='within')

# Align by index label rather than by position: `listings_` may contain fewer
# rows than `listings` (rows are filtered out during cleaning), so assigning a
# plain list would raise a length mismatch or shift districts onto the wrong
# rows. Duplicate labels (a point matching more than one polygon) are reduced
# to their first match so the lookup stays one-to-one.
district_by_listing = result['pri_neigh']
district_by_listing = district_by_listing[~district_by_listing.index.duplicated(keep='first')]
listings_ = listings_.assign(district=district_by_listing.reindex(listings_.index))


In [None]:
# Replace each categorical column with its integer category codes.
# NOTE(review): the code -> label mapping is not saved by this cell.
categorical_columns = ['property_type', 'room_type', 'district']
for column_name in categorical_columns:
    as_category = listings_[column_name].astype('category')
    listings_[column_name] = as_category.cat.codes.astype('int')

In [None]:
# Write the cleaned, encoded listings table to the data folder (row index omitted).
clean_csv_path = folder_path + 'data/listings_clean.csv'
listings_.to_csv(clean_csv_path, index=False)