# Machine Learning Project Class LB01 By:
- Bryan Leonardo 22017645004
- Rio Nagano 2201767232
- Ronaldo Kenny Chandra 2201763234

In [0]:
import pandas as pd
import numpy as np
 
from sklearn.preprocessing import MinMaxScaler as mms, OrdinalEncoder as ore
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans

# **Unsupervised K-Nearest Neighbors**

In [0]:
# randomize everytime the program is compiled
np.random.seed(0)

# get dataset
dataset = pd.read_csv("cars.csv")
# dataset = dataset[:5000]

# remove unwanted features (number_of_photos)
dataset = dataset[['manufacturer_name', 'model_name', 'transmission', 'color',
       'odometer_value', 'year_produced', 'engine_fuel', 'engine_has_gas',
       'engine_type', 'engine_capacity', 'body_type', 'has_warranty', 'state',
       'drivetrain', 'price_usd', 'is_exchangeable', 'location_region',
       'up_counter', 'feature_0', 'feature_1', 'feature_2',
       'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7',
       'feature_8', 'feature_9', 'duration_listed']]
# print(dataset.isnull().sum())

# found null value in engine_capacity of electric engine_type, which has no engine_capacity, set null value to 0
dataset = dataset.fillna(0)

# randomize data and reset its indices
dataset = dataset.sample(frac = 1).reset_index(drop = True)

# dataset has both string and numeric features, split them first
# string features encoded to numeric values
string_features = ore().fit_transform(dataset[["manufacturer_name" ,"model_name", "transmission", 
                                                "engine_fuel", "engine_has_gas", "engine_type", "body_type", 
                                                "has_warranty", "state", "drivetrain", "is_exchangeable"]])

# numeric features
number_features = dataset[["odometer_value", "year_produced", "engine_capacity", "price_usd", "up_counter", "duration_listed"]]

# turn encoded string features back into dataframe to be concatenated with numeric features
string_features = pd.DataFrame(string_features, columns = ["manufacturer_name" ,"model_name", "transmission", 
                                                            "engine_fuel", "engine_has_gas", "engine_type", "body_type", 
                                                            "has_warranty", "state", "drivetrain", "is_exchangeable"])

# concatenate both dataframes into new dataframe
new_data = pd.concat([string_features, number_features], axis = 1)

# get data for K-Means Clustering
cluster_data = new_data[:]

In [0]:
# Data Normalization with MinMaxScaler
nn_data = mms().fit_transform(new_data)

# Dimension Reduction with Principal Component Analysis
nn_data = PCA(n_components=10).fit_transform(nn_data)

In [0]:
# create NearestNeigbors learning model and fit the data
nn = NearestNeighbors(n_neighbors = 10).fit(nn_data)

# 'and' and 'or' operator in lists and dataframes are different, 'and' is normal 'and' operator in lists and is bitwise in dataframes, 
#                                                             and '&' is normal 'and' operator in dataframes and is bitwise in lists, 
#                                                             the same goes for 'or' operator and '|' operator
nn_query = nn_data[(dataset.manufacturer_name == "Subaru") & (dataset.engine_capacity == 3) & (dataset.price_usd >= 1000)]
nn_query = nn_data if len(nn_query) == 0 else nn_query

if(len(nn_query) == 0):
  print("No such data found, revert nn_query back to unfiltered default data")

# get both distance and indices with kneighbors() attribute for custom input
nn_distance, nn_indices = nn.kneighbors(nn_query)

# get graph for plot
nn_graph = nn.kneighbors_graph(nn_query).toarray()

In [317]:
print(nn_graph.shape)

(18, 38531)


In [318]:
# index = np.random.randint(indices.shape[0])
# while(index != -1):
# print("Available Pages: {} x {} rows.".format(nn_indices.shape[0], 10))
index = 0
a = nn_indices[index]

# create distance dataframe to concatenate with original dataset based on given index
nn_distance = pd.DataFrame(nn_distance[index], columns=["distance"])
nn_result = pd.DataFrame(columns=dataset.columns)

for i in range(len(a)):
  nn_result.loc[i] = dataset.iloc[a[i], :]

# concatenate both dataframes into one
nn_result = pd.concat([nn_distance, nn_result], axis = 1)

# display result based on custom sort by column
display(nn_result.sort_values(by=["distance"]))
# index = int(input("Go to page: "))

Unnamed: 0,distance,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,body_type,has_warranty,state,drivetrain,price_usd,is_exchangeable,location_region,up_counter,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,0.0,Subaru,Tribeca,automatic,black,188000,2006,gasoline,False,gasoline,3.0,suv,False,owned,all,8650.0,False,Минская обл.,25,False,True,False,False,True,True,False,False,True,True,37
1,0.026847,Subaru,Tribeca,automatic,blue,179000,2008,gasoline,False,gasoline,3.6,suv,False,owned,all,9900.0,False,Минская обл.,8,False,True,True,False,True,True,True,False,True,True,58
2,0.033468,Subaru,Tribeca,automatic,grey,154000,2008,gasoline,False,gasoline,3.6,suv,False,owned,all,8900.0,False,Минская обл.,16,False,True,True,True,True,True,False,True,True,True,77
3,0.035331,Subaru,Tribeca,automatic,black,237000,2006,gasoline,False,gasoline,3.0,suv,False,owned,all,8600.0,False,Минская обл.,58,False,True,False,True,True,True,False,False,True,True,301
4,0.040852,Subaru,Tribeca,automatic,grey,257495,2007,gasoline,False,gasoline,3.6,suv,False,owned,all,9900.0,False,Минская обл.,2,False,True,False,False,True,True,False,True,True,True,130
5,0.044594,Subaru,Tribeca,automatic,grey,252000,2007,gasoline,False,gasoline,3.0,suv,False,owned,all,8200.0,False,Могилевская обл.,1,False,False,False,False,False,False,False,False,False,True,10
6,0.052285,Subaru,Tribeca,automatic,grey,145000,2007,gasoline,False,gasoline,3.6,suv,False,owned,all,10750.0,False,Минская обл.,6,False,True,False,False,True,True,False,False,True,True,70
7,0.05507,Subaru,Tribeca,automatic,silver,140000,2008,gasoline,False,gasoline,3.6,suv,False,owned,all,10100.0,False,Минская обл.,1,False,True,False,True,True,True,False,False,False,True,68
8,0.055233,Subaru,Tribeca,automatic,white,140000,2007,gasoline,False,gasoline,3.6,suv,False,owned,all,10700.0,False,Минская обл.,11,False,True,False,False,True,True,False,False,True,True,63
9,0.055517,Subaru,Tribeca,automatic,silver,241402,2005,gasoline,False,gasoline,3.0,suv,False,owned,all,7500.0,False,Минская обл.,18,False,True,False,True,True,True,True,False,True,True,188


# **K-Means Clustering (Experiment to be Compared)**

In [0]:
# Data Normalization
kmeans_data = mms().fit_transform(cluster_data)

# Dimension Reduction
kmeans_data = PCA(n_components=10).fit_transform(kmeans_data)

In [0]:
# K-Means Clustering
kmeans = KMeans(n_clusters=10)

# Fit and predict the same data at once
# clusters = kmeans.fit_predict(kmeans_data)

# Fit then predict with custom query
clusters = kmeans.fit(kmeans_data)

# custom query, if query is empty, revert query back to fitted data
kmeans_query = kmeans_data[(dataset.manufacturer_name == "Subaru") & (dataset.engine_capacity == 3) & (dataset.price_usd >= 1000)]
kmeans_query = kmeans_data if len(kmeans_query) == 0 else kmeans_query

if(len(kmeans_query) == 0):
  print("No such data found, revert kmeans_query back to unfiltered default data")

# predicting clusters from given query based on n_clusters given
clusters = clusters.predict(kmeans_query)

In [0]:
# K-Means Clustering doesn't have distance attribute, solution: import one of many distance metrics from scipy
# and use euclidean distance to calculate distance between each data and center of all clusters
# euclidean distance => dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))
from sklearn.metrics.pairwise import euclidean_distances

random_cluster = np.random.randint(len(clusters))

# get indexes where clusters predicted is at randomized number in the clusters list by randomizing its index, then turn it to list
cluster_indices = np.where(np.isin(clusters, clusters[random_cluster]))[0].tolist()

# get clustered datas with indices of cluster 0
kmeans_result = kmeans_data[cluster_indices]

# calculate euclidean distance from library, transpose from certain rows x n_clusters columns to n_clusters row x certain columns to ease up next step
kmeans_distance = euclidean_distances(kmeans_result, kmeans.cluster_centers_).T

get_n_data = kmeans_distance.shape[1]

# make distances of cluster 0 to a dataframe to concatenate to original dataset
kmeans_distance = pd.DataFrame(kmeans_distance[0][ : get_n_data], columns=["distance"])

In [367]:
# create new empty dataset with original dataset columns
cluster_results = pd.DataFrame(columns=dataset.columns)

# get n datas 
for i in range(get_n_data):
  cluster_results.loc[i] = dataset.iloc[cluster_indices[i]]

# concatenate both distance and cluster_results dataframe into one
cluster_results = pd.concat([kmeans_distance, cluster_results], axis = 1)

# print dataframe results and sort it by distance, the lower the distance, the closer to center of clusters
print("Distance to cluster", clusters[random_cluster])
display(cluster_results.sort_values(by = ["distance"]))

Distance to cluster 8


Unnamed: 0,distance,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,body_type,has_warranty,state,drivetrain,price_usd,is_exchangeable,location_region,up_counter,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
4,1.164906,Volkswagen,Passat,mechanical,blue,420000,1993,diesel,False,diesel,1.9,sedan,False,owned,front,1500.0,False,Гродненская обл.,12,True,False,False,False,False,False,False,False,False,False,75
9,1.223515,Mercedes-Benz,T2,mechanical,blue,243072,1995,diesel,False,diesel,4.0,van,False,owned,rear,14750.0,False,Минская обл.,357,True,False,False,False,False,False,False,False,False,False,565
1,1.302357,Volvo,S80,automatic,black,304000,1999,gasoline,False,gasoline,2.9,sedan,False,owned,front,3000.0,False,Могилевская обл.,1,True,False,False,False,False,False,False,False,False,False,0
2,1.30532,BMW,525,mechanical,grey,280000,2000,diesel,False,diesel,2.5,universal,False,owned,rear,5490.0,False,Гомельская обл.,7,False,True,True,True,True,True,False,True,True,True,46
10,1.356351,Citroen,C4,automatic,black,130000,2012,gasoline,False,gasoline,1.6,hatchback,False,owned,front,8000.0,False,Гомельская обл.,10,False,True,False,True,False,True,False,True,True,True,59
0,1.557329,Mitsubishi,Lancer,automatic,black,200000,2005,gasoline,False,gasoline,2.0,sedan,False,owned,front,3500.0,True,Минская обл.,21,False,True,True,True,False,True,False,False,False,True,56
3,1.573124,Daewoo,Leganza,mechanical,blue,291000,2000,gasoline,False,gasoline,2.0,sedan,False,owned,front,1990.0,False,Гродненская обл.,5,False,True,False,False,False,False,False,False,False,False,7
7,1.717382,Volvo,XC90,automatic,black,200000,2003,gasoline,False,gasoline,2.9,suv,False,owned,all,6999.0,True,Гродненская обл.,138,False,True,False,False,True,True,False,False,True,True,463
6,1.957756,Honda,Civic,mechanical,silver,300000,1992,gasoline,False,gasoline,1.3,hatchback,False,owned,front,1350.0,True,Витебская обл.,28,False,False,False,False,True,False,False,False,False,True,36
8,1.960048,Opel,Astra,mechanical,silver,193750,2007,gasoline,False,gasoline,1.8,hatchback,False,owned,front,5490.0,True,Минская обл.,2,False,True,False,False,False,True,False,False,True,False,57


### Planned but Cancelled Methods:
1. SVD (Matrix Factorization)
2. Random Forest
3. Weighted Linear Regression (WLRSS)

# Machine Learning Project Class LB01  By:
- Bryan Leonardo 22017645004
- Rio Nagano 2201767232
- Ronaldo Kenny Chandra 2201763234