# Scikit-Learn California Housing Dataset


    MedInc - median income in block group
    HouseAge - median house age in a block group
    AveRooms - the average number of rooms (provided per household)
    AveBedrms - the average number of bedrooms (provided per household)
    Population - block group population
    AveOccup - the average number of household members
    Latitude - block group latitude
    Longitude - block group longitude
    MedHouseVal - median house value for California districts (hundreds of thousands of dollars)


In [4]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Fetch the dataset in pandas form; `.frame` holds the feature columns
# together with the MedHouseVal target column in a single DataFrame.
housing = fetch_california_housing(as_frame=True)
df = housing.frame
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [5]:
# Print the (rows, columns) shape of the full frame, target column included.
print(df.shape)

(20640, 9)


Делим целевую переменную MedHouseVal на 4 категории по квартилям

In [20]:
# Bin the continuous target into four quantile-based classes labelled 1..4,
# turning the regression target into a classification target.
df["MedHouseValCat"] = pd.qcut(df["MedHouseVal"], q=4, labels=[1, 2, 3, 4])

# Target: the quartile class. Features: everything except both target columns.
y = df["MedHouseValCat"]
X = df.drop(columns=["MedHouseVal", "MedHouseValCat"])
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MedInc,20640.0,3.870671,1.899822,0.4999,2.5634,3.5348,4.74325,15.0001
HouseAge,20640.0,28.639486,12.585558,1.0,18.0,29.0,37.0,52.0
AveRooms,20640.0,5.429,2.474173,0.846154,4.440716,5.229129,6.052381,141.909091
AveBedrms,20640.0,1.096675,0.473911,0.333333,1.006079,1.04878,1.099526,34.066667
Population,20640.0,1425.476744,1132.462122,3.0,787.0,1166.0,1725.0,35682.0
AveOccup,20640.0,3.070655,10.38605,0.692308,2.429741,2.818116,3.282261,1243.333333
Latitude,20640.0,35.631861,2.135952,32.54,33.93,34.26,37.71,41.95
Longitude,20640.0,-119.569704,2.003532,-124.35,-121.8,-118.49,-118.01,-114.31


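Видно, что медианные значения признаков сильно различаются по масштабу (например, Population ≈ 1166, а AveBedrms ≈ 1.05). Стандартизируем данные (приводим признаки к нулевому среднему и единичной дисперсии).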

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
print(X_train.shape, X_test.shape)

(15480, 8) (5160, 8)


In [22]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, then apply the same transform to the test split, so that no test-set
# statistics leak into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Take the column labels from X instead of a hand-typed list, so the summary
# below cannot drift out of sync with the actual feature set.
col_names = list(X.columns)
pd.DataFrame(X_train, columns=col_names).describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MedInc,15480.0,2.074711e-16,1.000032,-1.774632,-0.688854,-0.175663,0.46445,5.842113
HouseAge,15480.0,-1.232434e-16,1.000032,-2.188261,-0.840224,0.032036,0.666407,1.855852
AveRooms,15480.0,-1.620294e-16,1.000032,-1.877586,-0.407008,-0.08394,0.257082,56.357392
AveBedrms,15480.0,7.435912000000001e-17,1.000032,-1.740123,-0.205765,-0.108332,0.007435,55.925392
Population,15480.0,-8.996536000000001e-17,1.000032,-1.246395,-0.558886,-0.227928,0.262056,29.971725
AveOccup,15480.0,1.055716e-17,1.000032,-0.201946,-0.056581,-0.024172,0.014501,103.737365
Latitude,15480.0,7.890329e-16,1.000032,-1.451215,-0.79982,-0.645172,0.971601,2.953905
Longitude,15480.0,2.206676e-15,1.000032,-2.380303,-1.106817,0.536231,0.785934,2.633738


# Модель

In [23]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using the 10 closest training samples,
# fitted on the scaled training features.
classifier = KNeighborsClassifier(n_neighbors=10)
classifier.fit(X=X_train, y=y_train)

In [24]:
# Predicted quartile class for every row of the scaled test split.
y_pred = classifier.predict(X=X_test)

# Точность (accuracy)

In [25]:
# For a classifier, `score` returns the mean accuracy on the given data.
acc = classifier.score(X=X_test, y=y_test)
print(acc)

0.6191860465116279
