In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data"
df = pd.read_csv(url)
df.columns = [ "Sex", "Length", "Diameter", "Height", "Whole weight", "Shucked weight", "Viscera weight", "Shell weight", "Rings"]
df

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
1,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
2,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
3,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
4,I,0.425,0.300,0.095,0.3515,0.1410,0.0775,0.1200,8
...,...,...,...,...,...,...,...,...,...
4171,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4172,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4173,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4174,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [3]:
# df['Sex'] = df['Sex'].replace({'M': 0, 'F': 1, 'I':2})
#df = df.drop('Sex', axis=1)

X=df[['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight', 'Shell weight']].to_numpy()
y = df["Rings"].to_numpy()



In [4]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

model = KNeighborsRegressor(n_neighbors=10)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)



In [9]:
# Create lists to store K values and corresponding errors
np.random.seed(42)
k_values = []
mse_values = []
rmse_values = []
mae_values = []

# Assuming you have a range of K values in a list called k_range
k_range = [3,10, 15,20, 23, 24, 28, 100, 1000, 2500]  # You can adjust this as needed

for k in k_range:
    # Split the data into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Create and train the KNeighborsRegressor model
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(x_train, y_train)

    # Predict on the testing set
    y_pred = model.predict(x_test)

    # Calculate evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    # Append values to the lists
    k_values.append(k)
    mse_values.append(mse)
    rmse_values.append(rmse)
    mae_values.append(mae)

# Create a DataFrame to display the results
results_df = pd.DataFrame({
    'K Value': k_values,
    'MSE': mse_values,
    'RMSE': rmse_values,
    'MAE': mae_values
})

# Display the results in a table
print(results_df)
print("Overall, based on this evaluation, a K value of 20 appears to be the most suitable choice,\n as it results in the lowest MSE, RMSE, and MAE, suggesting it provides the best \n trade-off between bias and variance for the model on this specific dataset and problem.")

   K Value       MSE      RMSE       MAE
0        3  6.044870  2.458632  1.696196
1       10  4.664190  2.159674  1.538308
2       15  4.548944  2.132825  1.483852
3       20  4.427161  2.104082  1.489904
4       23  4.734989  2.176003  1.540789
5       24  4.548640  2.132754  1.501762
6       28  4.757035  2.181063  1.522888
7      100  4.715479  2.171515  1.547733
8     1000  7.369646  2.714709  2.021717
9     2500  8.648563  2.940844  2.186286
Overall, based on this evaluation, a K value of 20 appears to be the most suitable choice,
 as it results in the lowest MSE, RMSE, and MAE, suggesting it provides the best 
 trade-off between bias and variance for the model on this specific dataset and problem.


In [10]:
gs = GridSearchCV(estimator = KNeighborsRegressor(),
                  param_grid = {"n_neighbors": range(1,51),
                                "weights": ['uniform', 'distance'],
                                "p": [1,2]},
                  cv=5)

gs.fit(x_train, y_train)

results = pd.DataFrame(gs.cv_results_)
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_p,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006209,0.001871,0.008219,0.00336,1,1,uniform,"{'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}",0.289718,0.227523,0.265307,0.142019,0.223713,0.229656,0.050197,197
1,0.005668,0.002045,0.008714,0.001647,1,1,distance,"{'n_neighbors': 1, 'p': 1, 'weights': 'distance'}",0.289718,0.227523,0.265307,0.142019,0.223713,0.229656,0.050197,197
2,0.004023,2.7e-05,0.011335,0.001631,1,2,uniform,"{'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}",0.232198,0.199548,0.185058,0.225843,0.305146,0.229559,0.04151,199
3,0.005583,0.001968,0.009613,0.001976,1,2,distance,"{'n_neighbors': 1, 'p': 2, 'weights': 'distance'}",0.232198,0.199548,0.185058,0.225843,0.305146,0.229559,0.04151,199
4,0.006403,0.001963,0.008811,0.001592,2,1,uniform,"{'n_neighbors': 2, 'p': 1, 'weights': 'uniform'}",0.440049,0.421267,0.404751,0.34246,0.416283,0.404962,0.033262,193
