# Q1. Write a Python code to implement the KNN classifier algorithm on load_iris dataset in sklearn.datasets.

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
iris = load_iris()

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2)

# Create the KNN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)

# Train the KNN classifier on the training data
knn.fit(X_train, y_train)

# Use the trained KNN classifier to predict the test data
y_pred = knn.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9666666666666667


# Q2. Write a Python code to implement the KNN regressor algorithm on load_boston dataset in sklearn.datasets.

In [3]:
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv")
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [5]:
df.isna().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64

In [6]:
X = df.drop(labels=['medv'],axis=1)
Y = df['medv']

In [7]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X,Y, test_size=0.2,random_state=42)

In [8]:
xtrain.shape, ytrain.shape

((404, 13), (404,))

In [9]:
xtest.shape, ytest.shape

((102, 13), (102,))

In [10]:
from sklearn.neighbors import KNeighborsRegressor
knnr = KNeighborsRegressor(n_neighbors=5)

In [11]:
knnr.fit(xtrain,ytrain)

In [12]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(knnr,xtrain,ytrain,cv=5,scoring='r2')
scores

array([0.54227727, 0.43905346, 0.35690125, 0.53955425, 0.4431922 ])

In [13]:
import numpy as np
cval_score = np.mean(scores)
print(f'Mean Cross Validation R2 Score is : {cval_score:.4f}')

Mean Cross Validation R2 Score is : 0.4642


In [14]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  # number of neighbors to consider
    'weights': ['uniform', 'distance'],  # weight function used in prediction
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # algorithm used to compute nearest neighbors
    'p': [1, 2]  # power parameter for Minkowski distance metric
}

In [15]:
from sklearn.model_selection import GridSearchCV
gscv = GridSearchCV(KNeighborsRegressor(),param_grid=param_grid,
                    cv=5, scoring='neg_mean_squared_error')

In [16]:
gscv.fit(xtrain,ytrain)

In [17]:
gscv.best_params_

{'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [18]:
gscv.best_score_

-33.2971448976826

In [19]:
ypred_test = gscv.predict(xtest)
ypred_test

array([24.73786364, 31.72249009, 15.3507154 , 27.30501395, 17.2018207 ,
       25.80550606, 19.77102868, 15.86582451, 19.94556132, 20.51957611,
       23.34136189, 19.56304115, 12.34755581, 21.52977457, 23.20852325,
       22.26416486, 19.81820761, 14.24896371, 35.11804765, 11.62004094,
       27.83111718, 31.06348918, 16.3713894 , 19.4077173 , 17.60615938,
       20.66688916, 23.21525052, 11.55758578, 22.57776747, 20.58288984,
       24.09667949, 22.46416373, 10.30494761, 31.44171256, 20.23677213,
       21.15529179, 23.81484376, 19.24104178, 22.3877953 , 27.40639917,
       20.18090339, 31.74240806, 39.07315304, 21.66989478, 24.86095688,
       12.07345738, 16.41442709, 27.53919149, 19.93014439, 22.67300676,
       22.39242834, 36.48125731, 17.49914887, 22.49907307, 38.70586535,
       21.08122549, 12.42356856, 36.2305086 , 22.40289116, 21.37946557,
       21.62040381, 38.67943081, 35.59528839, 15.93855689, 29.07592693,
       20.38622952, 11.82438563, 26.09728109, 34.24273893, 13.94

In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(ytest, ypred_test)
mse = mean_squared_error(ytest,ypred_test)
rmse = mse**(1/2)
r2 = r2_score(ytest,ypred_test)

In [21]:
print(f'Testing MAE  : {mae:.2f}')
print(f'Testing MSE  : {mse:.2f}')
print(f'Testing RMSE : {rmse:.2f}')
print(f'Testing R2   : {r2:.4f}')

Testing MAE  : 3.27
Testing MSE  : 21.00
Testing RMSE : 4.58
Testing R2   : 0.7137


# Q3. Write a Python code snippet to find the optimal value of K for the KNN classifier algorithm using cross-validation on load_iris dataset in sklearn.datasets.

In [22]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset
iris = load_iris()

# Define the range of K values to test
k_range = range(1, 31)

# Create an empty list to store the mean accuracy scores for each value of K
k_scores = []

# Perform cross-validation for each value of K
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # Define the cross-validation method (here we use 10-fold cross-validation)
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    # Calculate the mean accuracy score using cross-validation
    scores = cross_val_score(knn, iris.data, iris.target, cv=cv, scoring='accuracy')
    k_scores.append(scores.mean())

# Find the optimal value of K with the highest mean accuracy score
optimal_k = k_range[k_scores.index(max(k_scores))]
print(f"Optimal value of K: {optimal_k}")

Optimal value of K: 14


# Q4. Implement the KNN regressor algorithm with feature scaling on load_boston dataset in sklearn.datasets.

In [23]:
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv")
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [24]:
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv")
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [25]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X,Y,test_size=0.3,random_state=42)
xtrain.shape, ytrain.shape

((354, 13), (354,))

In [26]:
xtest.shape, ytest.shape

((152, 13), (152,))

In [27]:
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()
xtrain = pd.DataFrame(scaler.fit_transform(xtrain),columns=xtrain.columns)
xtest = pd.DataFrame(scaler.transform(xtest),columns=xtest.columns)
xtrain.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,-0.414259,-0.505125,-1.292142,-0.281546,-0.851085,0.145264,-0.365584,1.081628,-0.746179,-1.11279,0.187271,0.396514,-1.015316
1,-0.402008,-0.505125,-0.162083,-0.281546,-0.087967,-0.208401,0.133941,-0.487876,-0.398464,0.150088,-0.21209,0.387067,-0.053663
2,-0.397211,-0.505125,-0.609489,-0.281546,-0.936828,-0.896237,-1.2669,0.628596,-0.746179,-1.046639,-0.167716,0.428541,-0.311324
3,-0.290936,-0.505125,-0.43197,-0.281546,-0.165136,-0.543965,-1.429789,0.345133,-0.630274,-0.601625,1.207859,0.31276,-0.822422
4,1.457816,-0.505125,1.0055,-0.281546,0.194987,-0.556496,0.079645,-0.403892,1.687825,1.557294,0.852872,0.104124,0.8038


In [28]:
xtest.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,-0.406801,-0.505125,-1.022256,-0.281546,-0.405218,0.125771,0.553831,-0.527348,-0.514369,-0.667776,-0.744571,0.412412,-0.476058
1,-0.411057,1.266629,-0.681651,3.551814,-0.945403,0.601965,-1.299478,0.146831,-0.630274,-0.920352,-0.300837,0.428541,-1.251858
2,-0.40512,-0.505125,2.396785,-0.281546,0.443643,-0.47713,1.085934,-0.8939,-0.630274,1.827911,0.808498,0.350317,0.795352
3,-0.406818,-0.505125,-0.046624,-0.281546,-1.236931,-0.362955,-2.208034,0.716631,-0.630274,-0.613653,0.409138,0.359533,-0.971669
4,0.195119,-0.505125,1.0055,-0.281546,1.335376,-0.039923,0.832552,-0.65834,1.687825,1.557294,0.852872,0.292484,0.682714


In [29]:
from sklearn.neighbors import KNeighborsRegressor
knnr = KNeighborsRegressor(n_neighbors=5)
knnr.fit(xtrain,ytrain)


In [30]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(knnr,xtrain,ytrain,scoring='r2',cv=5)
scores

array([0.70757722, 0.6421702 , 0.71614969, 0.71381021, 0.72490399])

In [31]:
import numpy as np
cval_score = np.mean(scores)
print(f'Cross Validation R2 Score on training data is {cval_score:.4f}')

Cross Validation R2 Score on training data is 0.7009


In [32]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  # number of neighbors to consider
    'weights': ['uniform', 'distance'],  # weight function used in prediction
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # algorithm used to compute nearest neighbors
    'p': [1, 2]  # power parameter for Minkowski distance metric
}

In [33]:
from sklearn.model_selection import GridSearchCV
gscv_boston = GridSearchCV(KNeighborsRegressor(),param_grid=param_grid,cv=5,scoring='r2')

In [34]:
gscv_boston.fit(xtrain,ytrain)

In [35]:
gscv_boston.best_params_

{'algorithm': 'auto', 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}

In [36]:
gscv_boston.best_score_

0.7829978784524041

In [37]:
ypred_test = gscv.predict(xtest)
ypred_test[0:5]

array([19.74435034, 19.74259828, 19.74302943, 19.74454149, 19.73959965])

In [38]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(ytest, ypred_test)
mse = mean_squared_error(ytest,ypred_test)
rmse = mse**(1/2)
r2 = r2_score(ytest,ypred_test)

In [39]:
print(f'Testing MAE  : {mae:.2f}')
print(f'Testing MSE  : {mse:.2f}')
print(f'Testing RMSE : {rmse:.2f}')
print(f'Testing R2   : {r2:.4f}')

Testing MAE  : 5.93
Testing MSE  : 77.25
Testing RMSE : 8.79
Testing R2   : -0.0367


# Q5. Write a Python code snippet to implement the KNN classifier algorithm with weighted voting on load_iris dataset in sklearn.datasets.

In [40]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset
iris = load_iris()

# Split the dataset into features (X) and target variable (y)
X = iris.data
y = iris.target

# Create the KNN classifier with k=5 and weighted voting
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')

# Train the KNN classifier on the entire dataset
knn.fit(X, y)

# Predict the class labels for new data
new_data = [[6.1, 3.1, 5.1, 1.9], [5.7, 2.8, 4.1, 1.3]]
y_pred = knn.predict(new_data)

print("Predicted class labels:", y_pred)

Predicted class labels: [2 1]


# Q6. Implement a function to standardise the features before applying KNN classifier.

In [41]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Load the dataset
iris = load_iris()

# Split the dataset into features (X) and target variable (y)
X = iris.data
y = iris.target

# Define a function to standardize the features
def standardize(X):
    scaler = StandardScaler()
    X_std = scaler.fit_transform(X)
    return X_std

# Standardize the features
X_std = standardize(X)

# Create the KNN classifier with k=5
knn = KNeighborsClassifier(n_neighbors=5)

# Train the KNN classifier on the standardized data
knn.fit(X_std, y)

# Predict the class labels for new data
new_data = [[6.1, 3.1, 5.1, 1.9], [5.7, 2.8, 4.1, 1.3]]
new_data_std = standardize(new_data)
y_pred = knn.predict(new_data_std)

print("Predicted class labels:", y_pred)

Predicted class labels: [2 0]


# Q7. Write a Python function to calculate the euclidean distance between two points.

In [42]:
from sklearn.datasets import load_iris
import math

# Load the iris dataset
iris = load_iris()

# Select two random data points from the dataset
x1 = iris.data[0]
x2 = iris.data[50]

# Define a function to calculate the Euclidean distance
def euclidean_distance(x1, x2):
    distance = 0
    for i in range(len(x1)):
        distance += (x1[i] - x2[i])**2
    distance = math.sqrt(distance)
    return distance

# Calculate the Euclidean distance between the two points
distance = euclidean_distance(x1, x2)

# Print the result
print("Euclidean distance between x1 and x2:", distance)

Euclidean distance between x1 and x2: 4.003748243833521


# Q8. Write a Python function to calculate the manhattan distance between two points.

In [43]:
from sklearn.datasets import load_iris

# Load the iris dataset
iris = load_iris()

# Select two random data points from the dataset
x1 = iris.data[0]
x2 = iris.data[50]

# Define a function to calculate the Manhattan distance
def manhattan_distance(x1, x2):
    distance = 0
    for i in range(len(x1)):
        distance += abs(x1[i] - x2[i])
    return distance

# Calculate the Manhattan distance between the two points
distance = manhattan_distance(x1, x2)

# Print the result
print("Manhattan distance between x1 and x2:", distance)

Manhattan distance between x1 and x2: 6.7
