In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance

In [3]:
# Load the cleaned master listing table and preview its first ten rows.
data = pd.read_csv(filepath_or_buffer='cleaned_master_table_25025.csv')
data.head(n=10)

Unnamed: 0,cc_list_id,original_list_date,current_list_price,current_status,days_on_market,cc_property_id,address,city,zipcode,fips,...,heatcode,garageparkingnbr,storiesnbrcode,stylecode,sumresidentialunits,municipality,assdtotalvalue,assdlandvalue,assdimprovementvalue,taxamount
0,145525704,2020-07-07,474900.0,4,57.0,71297111,144 ORVIS RD,Revere,2151,25025,...,4.0,0,200.0,4.0,1,REVERE,379200,199500,179700,427000.0
1,56683068,2020-09-04,474900.0,4,3.0,56683068,144 ORVIS RD,Revere,2151,25025,...,4.0,0,200.0,4.0,1,REVERE,379200,199500,179700,427000.0
2,143501563,2020-06-03,625000.0,1,13.0,71406284,261 BEACON ST APT 31,Boston,2116,25025,...,10.0,0,100.0,16.0,48,BOSTON-CENTRAL BOSTON,494200,0,494200,531753.0
3,59170780,2020-06-09,625000.0,6,5.0,59170780,261 BEACON ST APT 31,Boston,2116,25025,...,10.0,0,100.0,16.0,48,BOSTON-CENTRAL BOSTON,494200,0,494200,531753.0
4,143424293,2020-06-01,2650000.0,2,,71408771,133 COMMONWEALTH AVE APT 6,Boston,2116,25025,...,10.0,1,200.0,16.0,10,BOSTON-CENTRAL BOSTON,2280300,0,2280300,2158567.0
5,59166887,2020-09-07,2650000.0,2,3.0,59166887,133 COMMONWEALTH AVE APT 6,Boston,2116,25025,...,10.0,1,200.0,16.0,10,BOSTON-CENTRAL BOSTON,2280300,0,2280300,2158567.0
6,143173016,2020-05-19,899000.0,1,17.0,71409047,184 MARLBOROUGH ST APT 6,Boston,2116,25025,...,4.0,1,100.0,16.0,7,BOSTON-CENTRAL BOSTON,780800,0,780800,840805.0
7,59166913,2020-06-01,899000.0,6,3.0,59166913,184 MARLBOROUGH ST APT 6,Boston,2116,25025,...,4.0,1,100.0,16.0,7,BOSTON-CENTRAL BOSTON,780800,0,780800,840805.0
8,59152815,2020-06-22,599981.0,6,5.0,59152815,347 W 2ND ST,South Boston,2127,25025,...,4.0,2,300.0,15.0,2,BOSTON-SOUTH BOSTON,550900,0,550900,302039.0
9,143114904,2020-05-16,599981.0,4,51.0,174998645,347 W 2ND ST,Boston,2127,25025,...,4.0,2,300.0,15.0,2,BOSTON-SOUTH BOSTON,550900,0,550900,302039.0


In [4]:
def transformer(df, feature_name):
    '''
    One-hot encode a single categorical column.

    Params:
        df -> pandas dataframe: dataframe that holds the categorical column
        feature_name -> str: name of the column to encode; it is also used as
                             the prefix of every generated indicator column
    Returns:
        pandas dataframe: one indicator column per distinct value of the feature
    '''
    categorical_column = df[feature_name]
    return pd.get_dummies(categorical_column, prefix=feature_name)

In [5]:
def join(df, transformed_df):
    '''
    Attach the one-hot encoded columns to the original dataframe.

    Params:
        df -> pandas dataframe: the original dataframe (its rows are all kept)
        transformed_df -> pandas dataframe: one hot encoded dataframe, matched
                                            to df by index label
    Returns:
        pandas dataframe: the columns of df followed by the columns of
                          transformed_df
    '''
    # how='left' is the DataFrame.join default, spelled out for clarity
    return df.join(transformed_df, how='left')

In [6]:
def drop(df, cat_feature):
    '''
    Return a copy of the dataframe without the given column(s).

    Params:
        df -> pandas dataframe: the dataframe that contains the column(s)
        cat_feature -> str (or list of str): label(s) of the column(s) to remove
    Returns:
        pandas dataframe: df without the requested column(s); df itself is
                          left unchanged
    '''
    return df.drop(cat_feature, axis='columns')

In [7]:
# min-max normalize the features
def normalize(df):
    '''
    Min-max scale every column of a dataframe into the range [0, 1].

    Each column is scaled with its own minimum and maximum, computed from the
    dataframe that is passed in (missing values are ignored when computing
    them and stay missing in the result). A column whose values are all equal
    has a zero range; it is mapped to 0.0 instead of being divided by zero.

    Params:
        df -> pandas dataframe: the dataframe to normalize; every column must
                                be convertible to float (numeric or boolean).
                                It is not modified.
    Returns:
        normalized_df -> pandas dataframe: a new float dataframe with the same
                                           index and columns as df
    Raises:
        ValueError / TypeError: if a column cannot be converted to float
    '''
    # work on a float copy so boolean flags scale like 0/1 and the caller's
    # dataframe is never written to
    numeric_df = df.astype(float)

    col_min = numeric_df.min()
    col_range = numeric_df.max() - col_min

    # a constant column has range 0; divide by 1 so it becomes 0.0, not NaN
    safe_range = col_range.where(col_range != 0, 1.0)

    normalized_df = (numeric_df - col_min) / safe_range
    return normalized_df

In [11]:
# Columns that feed the recommender: price and time on market, location,
# size and age, amenity flags, property-type flags, school district,
# lot/building measurements, room and bath counts, garage spaces,
# heat and style codes, plus propertyclassid (used for grouping).
selected_features = [
    'current_list_price', 'days_on_market', 'latitude', 'longitude', 'sqft',
    'year_built', 'bedrooms_x', 'cooling', 'has_central_air', 'has_jacuzzi',
    'has_pool', 'has_solar', 'has_garage', 'is_MULTI_FAMILY', 'is_CONDO',
    'is_SINGLE', 'schooldistrictname', 'lotsizesqft', 'buildingarea',
    'effectiveyearbuilt', 'totalrooms', 'bathtotalcalc', 'garageparkingnbr',
    'heatcode', 'stylecode', 'propertyclassid',
]

# Categorical columns that are replaced by one-hot encoded columns.
cat_features = ['cooling', 'schooldistrictname', 'heatcode', 'stylecode']

df_for_recom = data[selected_features]

# Append the one-hot columns of every categorical feature to the table ...
for feature in cat_features:
    transformed_df = transformer(df_for_recom, feature)
    df_for_recom = join(df_for_recom, transformed_df)

# ... then remove the original categorical columns in one call.
df_for_recom = drop(df_for_recom, cat_features)

# Split the table by propertyclassid (key: propertyclassid, value: dataframe).
df_for_recom_dict = {}
keys = data['propertyclassid'].unique()
for key in keys:
    key_df = df_for_recom.loc[df_for_recom['propertyclassid'] == key]
    dropped_key_df = drop(key_df, 'propertyclassid')

    # normalize the features of this group's dataframe before storing it
    norm_key_df = normalize(dropped_key_df)
    df_for_recom_dict[key] = norm_key_df

# Distance metric options: cosine similarity, or euclidean distance
# (dst = distance.euclidean(a, b)).

In [12]:
# Display the one-hot encoded feature table (all property classes together).
df_for_recom

Unnamed: 0,current_list_price,days_on_market,latitude,longitude,sqft,year_built,bedrooms_x,has_central_air,has_jacuzzi,has_pool,...,stylecode_16.0,stylecode_19.0,stylecode_21.0,stylecode_23.0,stylecode_24.0,stylecode_29.0,stylecode_31.0,stylecode_35.0,stylecode_36.0,stylecode_41.0
0,474900.0,57.0,42.425470,-71.013550,1486.0,1954.0,3.0,True,False,False,...,0,0,0,0,0,0,0,0,0,0
1,474900.0,3.0,42.425475,-71.013551,1486.0,1954.0,3.0,True,False,False,...,0,0,0,0,0,0,0,0,0,0
2,625000.0,13.0,42.353260,-71.079510,535.0,1886.0,1.0,False,False,False,...,1,0,0,0,0,0,0,0,0,0
3,625000.0,5.0,42.353262,-71.079518,535.0,1886.0,1.0,False,False,False,...,1,0,0,0,0,0,0,0,0,0
4,2650000.0,,42.352410,-71.077960,1994.0,1899.0,2.0,True,False,False,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,615000.0,,42.339816,-71.080479,,,1.0,False,False,False,...,1,0,0,0,0,0,0,0,0,0
8374,489000.0,22.0,42.343720,-71.096190,475.0,1900.0,1.0,False,False,False,...,1,0,0,0,0,0,0,0,0,0
8375,975000.0,14.0,42.342040,-71.071540,765.0,1890.0,2.0,True,False,False,...,1,0,0,0,0,0,0,0,0,0
8376,1375000.0,18.0,42.341300,-71.078510,1270.0,1890.0,2.0,False,False,False,...,1,0,0,0,0,0,0,0,0,0


In [13]:
# example of dataframe by property class id:
# the prepared feature dataframe stored under the key 'R'
df_for_recom_dict['R']

Unnamed: 0,current_list_price,days_on_market,latitude,longitude,sqft,year_built,bedrooms_x,has_central_air,has_jacuzzi,has_pool,...,stylecode_16.0,stylecode_19.0,stylecode_21.0,stylecode_23.0,stylecode_24.0,stylecode_29.0,stylecode_31.0,stylecode_35.0,stylecode_36.0,stylecode_41.0
0,0.021090,0.038133,0.997138,0.001651,0.012272,0.736,0.125000,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.021090,0.002630,0.997139,0.001651,0.012272,0.736,0.125000,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.027762,0.009204,0.986039,0.001318,0.002737,0.464,0.041667,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.027762,0.003945,0.986040,0.001318,0.002737,0.464,0.041667,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.117763,,0.985909,0.001326,0.017366,0.516,0.083333,1,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,0.027317,,0.983973,0.001314,,,0.041667,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8374,0.021717,0.015122,0.984573,0.001234,0.002136,0.520,0.041667,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8375,0.043317,0.009862,0.984315,0.001359,0.005043,0.480,0.083333,1,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8376,0.061095,0.012492,0.984201,0.001323,0.010107,0.480,0.083333,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# index record: a Series indexed by cc_list_id whose values are the row
# labels of `data`, i.e. a lookup from listing id to row label
indices = pd.Series(data.index, index=data['cc_list_id'])

In [15]:
def get_similar_home(listingID, new, orig=data, n_recommendation=5):
    '''
    A method that returns the listings most similar to a given listing,
    ranked by cosine similarity of their prepared feature vectors.

    Params:
        listingID -> int: cc_list_id of the query listing in `orig`
        new -> pandas dataframe: the prepared feature dataset; its index labels
                                 must be the row labels of `orig` (it may be a
                                 subset of the rows). It is not modified.
        orig -> pandas dataframe: the original dataset, containing a
                                  'cc_list_id' column
        n_recommendation -> int: how many rows to return

    Return:
        A tuple (orig_rows, top):
            orig_rows -> the rows of `orig` for the top N most similar homes
            top -> the matching rows of the prepared dataset (missing values
                   filled with 0) plus a 'correlation' column holding the
                   cosine similarity to the query listing
        The query listing itself is part of the ranking.

    Raises:
        KeyError: if listingID is not present in `orig`, or if that listing is
                  not one of the rows of `new`
    '''
    # row label of the requested listing in the original dataset
    # (the first match is used if the id occurs more than once)
    matching_labels = orig.index[orig['cc_list_id'] == listingID]
    if len(matching_labels) == 0:
        raise KeyError(listingID)
    idx = matching_labels[0]

    # fill missing values on a copy so the caller's dataframe is untouched;
    # otherwise a second call would see the 'correlation' column as a feature
    features = new.fillna(0)

    # `new` can be a subset of `orig`, so translate the row label into the
    # position of that row inside `new` (assumes the index of `new` is unique)
    position = features.index.get_loc(idx)

    # similarity of the query row against every row: a single 1 x n result
    # instead of the full n x n matrix
    matrix = features.values
    scores = cosine_similarity(matrix[[position]], matrix)[0]

    # attach the score and sort the rows by it, best match first
    scored = features.assign(correlation=scores)
    result = scored.sort_values(by='correlation', ascending=False)

    # get the top N rows
    top = result.head(n_recommendation)
    top_index = list(top.index)

    # return the most similar rows in the original dataset
    return orig.loc[top_index, :], top


In [16]:
# Example: the ten listings most similar to listing 56683068 within class 'R'
recommendations = get_similar_home(
    listingID=56683068,
    new=df_for_recom_dict['R'],
    n_recommendation=10,
)
# show every column when printing the result
pd.set_option('display.max_columns', None)
print(recommendations)

(      cc_list_id original_list_date  current_list_price  current_status  \
1       56683068         2020-09-04            474900.0               4   
0      145525704         2020-07-07            474900.0               4   
5405    56683877         2020-02-26            449900.0               6   
5404   137981366         2020-04-17            449900.0               1   
3619    56683070         2020-02-07            469900.0               6   
7673    56679781         2020-05-01            535000.0               6   
7674   142455999         2020-04-26            535000.0               4   
5455   145010828         2020-06-25            499900.0               1   
3638   123562695         2019-11-15            889000.0               6   
3624    56683677         2020-04-06            600000.0               6   

      days_on_market  cc_property_id            address    city  zipcode  \
1                3.0        56683068       144 ORVIS RD  Revere     2151   
0               57.0 