In [None]:
import random as rand
import pandas as pd

#Fix the randomness
# NOTE: rand.seed() only seeds Python's built-in `random` module (imported
# here as `rand`). The pandas shuffle inside split_data() is made reproducible
# separately: it receives this same `seed` value as its random_state.
seed = 1234
rand.seed(seed)

import pandas as pd
import random

def load_data(input_file):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    input_file : str or file-like
        Location of the CSV data; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(input_file)

def split_data(input_file, output_file_1, output_file_2, ratio_split=0.2,
               min_price=100, min_city_instances=5, random_state=None):
    """Clean the raw CSV, shuffle it and write it out as two CSV files.

    Cleaning steps, in order:
      1. drop the ``date``, ``country`` and ``street`` columns;
      2. keep only rows whose ``price`` is strictly greater than ``min_price``
         (this also removes the rows recorded with a price of 0.0);
      3. keep only rows whose ``city`` still has at least
         ``min_city_instances`` rows after step 2.

    The remaining rows are shuffled, then the first
    ``int(n_rows * (1 - ratio_split))`` rows go to ``output_file_1`` and the
    rest to ``output_file_2``. Both files are written with a header row and
    without the index column.

    Parameters
    ----------
    input_file : str
        Path of the raw CSV file. It must contain the columns ``date``,
        ``country``, ``street``, ``price`` and ``city``.
    output_file_1 : str
        Destination of the larger part of the split.
    output_file_2 : str
        Destination of the remaining ``ratio_split`` share of the rows.
    ratio_split : float, default 0.2
        Share of the rows written to ``output_file_2``; must lie in [0, 1].
    min_price : float, default 100
        Rows priced at or below this value are treated as noise and dropped.
    min_city_instances : int, default 5
        Minimum number of rows a city needs in order to be kept.
    random_state : int or None, default None
        Seed for the shuffle. ``None`` means "use the notebook-level ``seed``".

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``ratio_split`` is outside [0, 1].
    """
    # Reject impossible ratios up front: a value above 1 would otherwise give
    # a negative split index and a silently wrong partition.
    if not 0 <= ratio_split <= 1:
        raise ValueError(f"ratio_split must be between 0 and 1, got {ratio_split!r}")
    if random_state is None:
        # Keep the historical behaviour: shuffle with the notebook-wide seed.
        random_state = seed

    df = pd.read_csv(input_file)

    # Remove columns: date, country, street (since all data was collected inside the USA)
    df = df.drop(columns=["date", "country", "street"])

    # 1. Filter noisy data (houses priced at or below min_price, e.g. price == 0.0)
    df = df[df["price"] > min_price]

    # 2. Remove the records of cities that have fewer than min_city_instances rows
    city_counts = df['city'].value_counts()
    cities_to_keep = city_counts[city_counts >= min_city_instances].index
    df = df[df['city'].isin(cities_to_keep)]

    # 3. Shuffle reproducibly, then cut at the requested ratio
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    split_index = int(len(df) * (1 - ratio_split))

    df.iloc[:split_index, :].to_csv(output_file_1, index=False, header=True)
    df.iloc[split_index:, :].to_csv(output_file_2, index=False, header=True)
    return None

## 1. Create the database.csv and evaluation.csv files

In [None]:
# Raw input and the two files produced by the split.
input_file_path = 'data.csv'
output_file_1_path = 'database.csv'
output_file_2_path = 'evaluation.csv'
# Share of the shuffled rows that ends up in the second (evaluation) file.
split_ratio = 0.2

split_data(
    input_file=input_file_path,
    output_file_1=output_file_1_path,
    output_file_2=output_file_2_path,
    ratio_split=split_ratio,
)

## 2. Load the dataset 

In [None]:
# Read back the two CSV files written by the split step.
database = load_data('database.csv')
evaluation = load_data('evaluation.csv')

# Min-max normalisation of the database is currently disabled:
# database=(database-database.min())/(database.max()-database.min())

# Weight dictionary: every column except the location labels and the target
# price starts with the same weight of 1.0.
weight_dist = {}
for key in database.columns:
    if key in ("city", "statezip", "price"):
        continue
    weight_dist[key] = 1.0

# Reference sample: the row at fixed position 2828 of the database
# (a hard-coded index, not a random draw).
sample = database.iloc[2828]

In [None]:
# Show the first rows, the reference sample and the weights in one go;
# sep="\n" puts each item on its own line, exactly like separate print calls.
print(
    database.head(4),
    "---------------------------------",
    f"A sample : {sample}",
    "---------------------------------",
    f"The weight dictionary : {weight_dist}",
    sep="\n",
)

In [None]:
# One row per (city, statezip) pair together with the number of houses in it.
communities = (
    database
    .groupby(['city', 'statezip'])
    .size()
    .reset_index(name='numberOfInstances')
)
print(communities)  # the original author noted 102 communities here

In [None]:
# Nested lookup table: city -> statezip -> DataFrame holding that zone's rows,
# with the two grouping columns removed.
communities_dict = {}
for city, city_group in database.groupby('city'):
    communities_dict[city] = {
        zipcode: zip_group.drop(columns=['city', 'statezip'])
        for zipcode, zip_group in city_group.groupby('statezip')
    }

# Testing
print(communities_dict["Algona"]["WA 98001"])

In [None]:
def find_community(sample, communities_dict, neighborhood_mini=5):
    """Return the pool of database rows considered "near" ``sample``.

    The rows sharing the sample's ``statezip`` are returned when that zone
    holds strictly more than ``neighborhood_mini`` rows. Otherwise (zone too
    small, or zip code absent from ``communities_dict``) every zone of the
    sample's city is stacked into a single DataFrame and returned.

    Parameters
    ----------
    sample : mapping or pandas.Series
        Must provide the keys ``'city'`` and ``'statezip'``.
    communities_dict : dict
        Nested mapping ``city -> statezip -> DataFrame``.
    neighborhood_mini : int, default 5
        A zip-code zone is used on its own only if it has more rows than this.

    Returns
    -------
    pandas.DataFrame
        The candidate neighbours; empty if the city has no zones at all.

    Raises
    ------
    KeyError
        If the sample's city is not a key of ``communities_dict``.
    """
    sample_zipcode = sample['statezip']
    sample_city = sample['city']
    if sample_city not in communities_dict:
        raise KeyError(f"city {sample_city!r} is not present in communities_dict")
    city_zones = communities_dict[sample_city]

    # A zip code that is missing from the database is treated like a zone
    # that is too small: fall through to the city-wide pool.
    closest_zone_samples = city_zones.get(sample_zipcode)
    if closest_zone_samples is not None and len(closest_zone_samples) > neighborhood_mini:
        return closest_zone_samples

    if not city_zones:
        return pd.DataFrame()
    # Single concat over all zones instead of growing a frame inside a loop.
    return pd.concat(list(city_zones.values()))

def find_k_neighbors(sample, database, k=5):
    """Return the ``k`` smallest distances between ``sample`` and its community.

    The candidate rows come from ``find_community(sample, database)``, and each
    one is compared with ``sample`` through ``cal_distance`` using its default
    weights.

    Parameters
    ----------
    sample : mapping or pandas.Series
        The house to find neighbours for.
    database : dict
        Passed to ``find_community`` as its ``communities_dict`` argument, so
        it has to be the nested ``city -> statezip -> DataFrame`` mapping.
    k : int, default 5
        Number of distances to keep.

    Returns
    -------
    list
        Distances sorted in ascending order, at most ``k`` of them.
    """
    closest_zone_samples = find_community(sample, database)
    # Iterating a DataFrame directly yields its column labels; iterrows() is
    # needed to visit the rows themselves.
    samples_distance = [
        cal_distance(sample, neighbor)
        for _, neighbor in closest_zone_samples.iterrows()
    ]
    samples_distance.sort()
    return samples_distance[:k]

def cal_distance(sample, another_sample, weight_dist=None):
    """Weighted Manhattan (L1) distance between two samples.

    For every feature named in ``weight_dist`` the absolute difference between
    the two samples is multiplied by the feature's weight, and the products
    are summed. Using absolute differences keeps the result non-negative and
    symmetric, and stops opposite-signed differences from cancelling out.

    Parameters
    ----------
    sample, another_sample : mapping or pandas.Series
        Must provide a numeric value for every key of ``weight_dist``.
    weight_dist : dict or None, default None
        Mapping ``feature name -> weight``. ``None`` means "use the
        notebook-level ``weight_dist``", looked up when the function is
        called so that a rebuilt weight dictionary is picked up.

    Returns
    -------
    float
        The weighted distance; 0.0 when ``weight_dist`` is empty.
    """
    if weight_dist is None:
        # The parameter shadows the notebook global of the same name, hence
        # the explicit globals() lookup.
        weight_dist = globals()["weight_dist"]

    difference = 0.0
    for k, v in weight_dist.items():
        difference += v * abs(sample[k] - another_sample[k])

    return difference

def adapt_price_from_dissimilarities(sample, neighbor, weight_dict):
    """Adapt a neighbour's price to the living area of ``sample``.

    Starting from ``neighbor['price']``, the price is shifted by the
    neighbour's price per square foot times the difference in
    ``sqft_living`` between ``sample`` and ``neighbor``. The adjustment is
    applied only when ``'sqft_living'`` is one of the features listed in
    ``weight_dict``; the weight value itself is not used.

    Parameters
    ----------
    sample : mapping or pandas.Series
        Must provide ``'sqft_living'``.
    neighbor : mapping or pandas.Series
        Must provide ``'price'`` and ``'sqft_living'``.
    weight_dict : dict
        Feature weights; only its keys are inspected.

    Returns
    -------
    number
        The adapted price, or the neighbour's unchanged price when no
        adjustment applies.
    """
    adapted_price = neighbor['price']
    neighbor_area = neighbor['sqft_living']

    # The correction is driven by the living-area feature. A neighbour with a
    # recorded area of 0 has no meaningful price per square foot, so its price
    # is returned unchanged rather than dividing by zero.
    if 'sqft_living' in weight_dict and neighbor_area != 0:
        price_per_sqft = adapted_price / neighbor_area
        adapted_price += price_per_sqft * (sample['sqft_living'] - neighbor_area)
    return adapted_price

In [None]:
# Display the candidate neighbours of `sample`, using a zone-size threshold of
# 3 rows instead of the default 5.
find_community(sample, communities_dict, neighborhood_mini=3)

In [None]:
# Smoke test: distance between the database rows at positions 1 and 20,
# computed with the default weights.
print(cal_distance(database.iloc[1], database.iloc[20]))