In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score

## K-Nearest Neighbors algorithm (Using functions)

In [2]:
def euclidean_distance(tr_X,tst_X):
    d_t = tr_X - tst_X
    dist = np.sqrt(np.dot(d_t,np.transpose(d_t)))
    
    return dist

In [3]:
def NN_func(train_x,train_y,x,K,prb_typ):
    
    NN_dist = dict()
    y_pred = list()
    
    # Calculate the nearest neighbors distance 
    for i in range(len(train_x)):
        NN_dist[i] = euclidean_distance(train_x[i], x)
    
    # Sort in nearest neighbors
    NN_dist = list(sorted(NN_dist.items(), key=lambda x: x[1]))
    for idx in NN_dist[:K]:
        y_pred.append(train_y.loc[idx[0]])
    
    # Output based on regression/classification
    if prb_typ == 'reg':
        return np.mean(y_pred)
    else:
        return max(y_pred, key = y_pred.count)

In [4]:
def KNN_func(train_x,train_y,test_x,test_y,n_neighbors=1):
    
    #Feature scaling the variables in training and test data
    scaled_train_x = train_x.apply(lambda x: (x - np.mean(x))/np.std(x))
    scaled_test_x = test_x.apply(lambda x: (x - np.mean(x))/np.std(x))
    
    #Converting the training and test data to matrices
    x_train_matrix = scaled_train_x.values
    x_test_matrix = scaled_test_x.values
    
    # Creating a dataframe to record actual v/s predicted values
    prediction_df = {'Actual value':[],
                     'Predicted value':[]}
    
    # Classifying the type of problem: regression/classification
    if len(y_train.unique()) > 10:
        reg_class = 'reg'
    else:
        reg_class = 'class'
    
    # Prediction on all observations from test data
    for i in range(len(x_test_matrix)):
        KNN_pred = NN_func(x_train_matrix,train_y,x_test_matrix[i],n_neighbors,reg_class)
    
        #Checking for 1-NN for the test observations
        prediction_df['Actual value'].append(test_y.iloc[i])
        prediction_df['Predicted value'].append(KNN_pred)
            
    return pd.DataFrame(prediction_df)

## `a) KNN Classification`

### Preparing data

In [5]:
penguins = pd.read_csv('penguins_size.csv')
penguins.shape

(344, 7)

In [6]:
penguins.dropna(inplace=True)
penguins.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


In [7]:
x = penguins.drop('species', axis=1)
y = penguins['species']

In [8]:
x = pd.get_dummies(x, drop_first = True)
x.sample(4)

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,island_Dream,island_Torgersen,sex_MALE
26,40.6,18.6,183.0,3550.0,0,0,1
119,41.1,18.6,189.0,3325.0,0,1,1
176,46.7,17.9,195.0,3300.0,1,0,0
226,45.4,14.6,211.0,4800.0,0,0,0


In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, stratify=y, random_state = 100)

In [10]:
(len(y_train), len(y_test))

(233, 101)

In [11]:
y_train.reset_index(drop = True, inplace=True)
y_test.reset_index(drop = True, inplace=True)

### Model building and evaluation

In [12]:
y_pred = KNN_func(x_train,y_train,x_test,y_test)

In [13]:
y_pred.head(10)

Unnamed: 0,Actual value,Predicted value
0,Adelie,Adelie
1,Adelie,Adelie
2,Gentoo,Gentoo
3,Adelie,Adelie
4,Adelie,Adelie
5,Adelie,Adelie
6,Adelie,Adelie
7,Chinstrap,Chinstrap
8,Chinstrap,Chinstrap
9,Adelie,Adelie


In [14]:
print('Confusion Matrix')
print('-'*50)
print(confusion_matrix(y_pred['Actual value'], y_pred['Predicted value']))
print('-'*50)
print('\nClassification Report')
print('-'*50)
print(classification_report(y_pred['Actual value'], y_pred['Predicted value']))

Confusion Matrix
--------------------------------------------------
[[44  0  0]
 [ 1 20  0]
 [ 0  0 36]]
--------------------------------------------------

Classification Report
--------------------------------------------------
              precision    recall  f1-score   support

      Adelie       0.98      1.00      0.99        44
   Chinstrap       1.00      0.95      0.98        21
      Gentoo       1.00      1.00      1.00        36

    accuracy                           0.99       101
   macro avg       0.99      0.98      0.99       101
weighted avg       0.99      0.99      0.99       101



## `b) KNN Regression`

### Preparing the data

In [15]:
train = pd.read_csv("wk3_kc_house_train_data.csv")
test = pd.read_csv("wk3_kc_house_test_data.csv")

(train.shape, test.shape)

((9761, 21), (2217, 21))

In [16]:
train.head(2)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
1,7237550310,20140512T000000,1225000.0,4,4.5,5420,101930,1.0,0,0,...,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930


In [17]:
test.head(2)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,3793500160,20150312T000000,323000.0,3,2.5,1890,6560,2.0,0,0,...,7,1890,0,2003,0,98038,47.3684,-122.031,2390,7570
1,1175000570,20150312T000000,530000.0,5,2.0,1810,4850,1.5,0,0,...,7,1810,0,1900,0,98107,47.67,-122.394,1360,4850


In [18]:
x_train = train.drop(['id','date','price'], axis=1)
x_test = test.drop(['id','date','price'], axis=1)

y_train = train['price']
y_test = test['price']

### Model Building and evaluation

In [19]:
y_pred = KNN_func(x_train,y_train,x_test,y_test,n_neighbors = 10)

In [20]:
y_pred.head()

Unnamed: 0,Actual value,Predicted value
0,323000.0,304515.0
1,530000.0,621500.0
2,189000.0,203170.0
3,687500.0,583030.0
4,240000.0,204924.8


In [21]:
print("RMSE of KNN Regressor model= ",np.sqrt(mean_squared_error(y_pred['Predicted value'], y_pred['Actual value'])))
print('R-square of KNN Regressor model',r2_score(y_pred['Actual value'],y_pred['Predicted value']))

RMSE of KNN Regressor model=  166553.56595339207
R-square of KNN Regressor model 0.7839701029091093
