### Regression

In [6]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Fetch the California housing dataset and combine its feature matrix and
# target vector into one DataFrame; the target is stored as "MedHouseVal".
california = fetch_california_housing()
df_california = pd.DataFrame(
    data=california.data,
    columns=california.feature_names,
).assign(MedHouseVal=california.target)

# Preview the first rows of the assembled frame.
df_california.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the regression target from the predictor columns.
y = df_california["MedHouseVal"]
x = df_california.drop(columns="MedHouseVal")

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test data does not influence them.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [13]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fit a k-nearest-neighbours regressor with its default settings on the
# scaled training data (fit() returns the estimator itself).
knn_regressor = KNeighborsRegressor().fit(x_train_scaled, y_train)

# Predict the target for the held-out, scaled test features.
y_pred = knn_regressor.predict(x_test_scaled)

# Compare the predictions with the true test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Mean Squared Error : {mse}")
print(f"R2 Score : {r2}")

Mean Squared Error : 0.4324216146043236
R2 Score : 0.6700101862970989


### Classification

In [16]:
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset and combine its feature matrix and class
# labels into one DataFrame; the labels are stored in the "target" column.
breast_cancer = load_breast_cancer()
df_breast_cancer = pd.DataFrame(
    data=breast_cancer.data,
    columns=breast_cancer.feature_names,
).assign(target=breast_cancer.target)

# Preview the first rows of the assembled frame.
df_breast_cancer.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [19]:
# Separate the class labels from the predictor columns.
y = df_breast_cancer["target"]
x = df_breast_cancer.drop(columns="target")

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=4
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Fit a k-nearest-neighbours classifier with its default settings on the
# scaled training data (fit() returns the estimator itself).
knn_classifier = KNeighborsClassifier().fit(x_train_scaled, y_train)

# Predict class labels for the held-out, scaled test features.
y_pred = knn_classifier.predict(x_test_scaled)

# Fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# TODO: a confusion matrix and classification report were sketched here but
# are disabled; neither function is imported in this cell.

print(f"Accuracy: {accuracy}")

Accuracy: 0.9649122807017544


##### Parameters

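- n_neighbors = 5 (default)
- weights = 'uniform' (default) [all neighbors get equal weight]
    - 'distance' [closer neighbors get more weight (weighted by the inverse of their distance)]
- algorithm = 'auto' (default) [best to use; picks the most suitable search algorithm automatically]
- p = 2 (default) [power parameter of the Minkowski metric; p = 2 gives Euclidean distance]
- metric = 'minkowski' (default) [in the formula, p = 2 (default) gives Euclidean distance and p = 1 gives Manhattan distance]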
    - Manhattan (p = 1)
        - better with high-dimensional data
        - input columns are categorical or discrete
        - more robust to outliers
    - Euclidean (p = 2)
        - poor with high-dimensional data
        - input columns are continuous
        - very sensitive to outliers
            

##### When to use:
- simple and easy to use
- small and mid-sized data
- can do both regression and classification

##### When not to use:
- large datasets
- high-dimensional data (curse of dimensionality)
- data has outliers (KNN is sensitive to outliers)
- SLOW -> computationally intensive (the entire training set is stored in memory and searched for every prediction)
- imbalanced datasets
- performs poorly if no feature scaling is done (features with large ranges dominate the distance)