In [1]:
import pandas as pd
import numpy as np

In [28]:
# Column labels for the data file, in the order the fields appear on each row.
cols = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight',
    'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system',
    'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm',
    'city-mpg', 'highway-mpg', 'price',
]

# No row of the file is treated as a header; the labels above are applied instead.
cars = pd.read_csv('imports-85.data', header=None, names=cols)

cars.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [29]:
# Keep only the continuous-valued attributes, as listed in the dataset description:
# https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.names
continuous_values_cols = [
    'normalized-losses',
    'wheel-base', 'length', 'width', 'height', 'curb-weight',
    'engine-size', 'bore', 'stroke', 'compression-rate',
    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
    'price',
]
car_num = cars.loc[:, continuous_values_cols]
car_num.head()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,?,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,13495
1,?,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500
2,?,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500
3,164,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30,13950
4,164,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22,17450


### Data Cleaning


In [30]:
# Swap every '?' placeholder for NumPy NaN so pandas recognises it as missing
car_num = car_num.replace(to_replace='?', value=np.nan)
car_num.head(n=3)

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,13495
1,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500
2,,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500


In [31]:
# Cast every column to float
car_num = car_num.astype(float)
# Count the missing values in each column
car_num.isna().sum()

normalized-losses    41
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [32]:
# price is the target variable, so rows that lack it are discarded
car_num = car_num.loc[car_num['price'].notna()]
car_num.isna().sum()


normalized-losses    37
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [33]:
# Fill the remaining gaps in each column with that column's own mean
car_num = car_num.fillna(value=car_num.mean(axis=0))

In [34]:
# Confirm the imputation: count the missing values left in each column
car_num.isna().sum()

normalized-losses    0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-size          0
bore                 0
stroke               0
compression-rate     0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [35]:
# Min-max scale every column to the 0-1 range, then put the untouched
# target values back so price stays in its original units.
car_price = car_num['price']
car_num = (
    car_num
    .sub(car_num.min())
    .div(car_num.max() - car_num.min())
    .assign(price=car_price)
)

### Univariate Model

In [38]:
from sklearn.neighbors  import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

def knn_train_test(train_col, target_col, data, k=5):
    """Fit a univariate k-NN regressor on half of ``data`` and score it on the rest.

    The rows are shuffled with a fixed seed; the first half (rounded down)
    is used for training and the remaining rows for testing.

    Parameters
    ----------
    train_col : str
        Name of the single feature column used as the predictor.
    target_col : str
        Name of the column to predict.
    data : pandas.DataFrame
        Frame containing both ``train_col`` and ``target_col``.
    k : int, default 5
        Number of neighbours passed to ``KNeighborsRegressor``. The default
        matches scikit-learn's own default, so omitting it reproduces the
        behaviour of the previous no-argument constructor call.

    Returns
    -------
    float
        Root mean squared error of the predictions on the held-out rows.
    """
    # A private generator seeded with 1 produces the same shuffle as
    # np.random.seed(1) + np.random.permutation, but leaves the global
    # NumPy random state untouched for the rest of the notebook.
    rng = np.random.RandomState(1)
    shuffled_index = rng.permutation(data.index)
    rand_df = data.reindex(shuffled_index)

    # Split point: half the number of rows, rounded down
    middle_num = len(rand_df) // 2
    train_df = rand_df.iloc[:middle_num]
    test_df = rand_df.iloc[middle_num:]

    # Double brackets keep the feature as a one-column frame for fit/predict
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(train_df[[train_col]], train_df[target_col])
    pred_labels = knn.predict(test_df[[train_col]])

    mse = mean_squared_error(test_df[target_col], pred_labels)
    return np.sqrt(mse)

# Score each feature on its own against price, keyed by column name
train_cols = car_num.columns.drop('price')
rmse_results = {
    col: knn_train_test(col, 'price', car_num)
    for col in train_cols
}

# A Series makes the scores easy to rank, lowest RMSE first
series_rmse = pd.Series(rmse_results)
series_rmse.sort_values()
    

engine-size          3271.201399
horsepower           3998.452040
curb-weight          4410.757785
width                4619.368133
city-mpg             5089.138311
highway-mpg          5180.535835
length               5419.972211
wheel-base           5465.236829
compression-rate     6276.684871
bore                 6760.263778
peak-rpm             7373.272378
height               7592.144535
normalized-losses    7627.705962
stroke               8371.272109
dtype: float64