In [27]:
# importing the required packages 

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import preprocessing 
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
np.random.seed(42)


## Loading and cleaning the data 

In [12]:
# Loaded the data into a pandas data frame called SampleData
SampleData = pd.read_csv('./sample.csv')
SampleData

Unnamed: 0,x,y
0,55.3846,97.1795
1,51.5385,96.0256
2,46.1538,94.4872
3,42.8205,91.4103
4,40.7692,88.3333
...,...,...
137,39.4872,25.3846
138,91.2821,41.5385
139,50.0000,95.7692
140,47.9487,95.0000


In [13]:
# Checking for null values
SampleData.isna().sum()
# There are no null values in this data set 

x    0
y    0
dtype: int64

In [14]:
# check the variable types
SampleData.dtypes


x    float64
y    float64
dtype: object

## Split and normalize the data

In [16]:
'''
In the first part I have normalised the data set 
'''
scaler = MinMaxScaler()
SampleData_Scaled = pd.DataFrame(scaler.fit_transform(SampleData),index = SampleData.index, columns = SampleData.columns)

''' In the second part, I have split the data into training and test, using the train-test-split package. In this dataset we 
have only one feature x and one target y
'''

feature = SampleData_Scaled.drop(columns=['x'])
target = SampleData_Scaled['y']

# split the data into validation and training set
x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.3, random_state=1)


In [24]:
print (SampleData_Scaled)


            x         y
0    0.067121  1.838544
1   -0.163101  1.795553
2   -0.485423  1.738236
3   -0.684950  1.623599
4   -0.807738  1.508959
..        ...       ...
137 -0.884477 -0.836338
138  2.215899 -0.234488
139 -0.255194  1.786000
140 -0.377982  1.757342
141 -0.608205  1.671363

[142 rows x 2 columns]


## Train a knn model

In [17]:
# Get the number of observations in the dataset
n = len(SampleData)
print(n)

142


In [18]:
# Since a good starting point for k is the square root of the number of observations k= 11 approximately 

knn_regressor = KNeighborsRegressor(n_neighbors=11)
knn_regressor.fit(x_train, y_train)

# Make a prediction on the test set
y_pred = knn_regressor.predict(x_test)

# Evaluate the model using the mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.00013087828495047293


## Checking for other k values

In [26]:
# Initialize an empty results list
results = []
for k in range(1,100):
    
    knn_regressor = KNeighborsRegressor(n_neighbors=k)
    knn_regressor.fit(x_train, y_train)
    
    # Make a prediction on the test set
    y_pred = knn_regressor.predict(x_test)
    
    # Evaluate the model using the mean squared error
    mse = mean_squared_error(y_test, y_pred)
    
    results.append(mse)
    
 # Display the output

for indx in range(0,99):
    
    print( "For k = " + str(indx+1) + " Mean Square Error Is " + str(results[indx]))
    
    
print(min(results))    

# From the results it can be seen that for k=2 the rmse is 4.7197290932926475e-05 which is minimum 
    

For k = 1 Mean Square Error Is 5.171809782099886e-05
For k = 2 Mean Square Error Is 4.7197290932926475e-05
For k = 3 Mean Square Error Is 5.671896565853924e-05
For k = 4 Mean Square Error Is 6.875774969357902e-05
For k = 5 Mean Square Error Is 8.1099121522324e-05
For k = 6 Mean Square Error Is 8.770449409766878e-05
For k = 7 Mean Square Error Is 0.00011005565979025114
For k = 8 Mean Square Error Is 0.00013119291286904544
For k = 9 Mean Square Error Is 0.00012922473242744313
For k = 10 Mean Square Error Is 0.00013546423915036289
For k = 11 Mean Square Error Is 0.00013087828495047293
For k = 12 Mean Square Error Is 0.00013909309562615694
For k = 13 Mean Square Error Is 0.00015532364601533185
For k = 14 Mean Square Error Is 0.00017655505394900816
For k = 15 Mean Square Error Is 0.0001999754773556298
For k = 16 Mean Square Error Is 0.00025632652098005215
For k = 17 Mean Square Error Is 0.00029420497570869397
For k = 18 Mean Square Error Is 0.0003082731888287335
For k = 19 Mean Square Error

## Fitting a Linear Model

In [28]:
# Fitting the x train and the y train data on the linear Model
model = LinearRegression()
model.fit(x_train, y_train)



LinearRegression()

In [61]:
# Using the linear model to predict the target values


results_dataframe = y_test.to_frame()

# Store the values of the predicted x_test values (With the help of the Linear model) in the predicted column of the 
#results_dataframe

results_dataframe['predicted'] = model.predict(x_test)

mean_squared_error(results_dataframe['y'], results_dataframe['predicted']) 
#results_dataframe

3.1128402589514626e-32

From the results of the root mean square error we can see that the rms value for the Linear Model is way less than knn, which suggests that we should should use linear modeling for this dataset.
