# California dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split

from matplotlib import pyplot as plt

In [2]:
dataCalifornia = fetch_california_housing()

In [5]:
print(dataCalifornia['DESCR'])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [11]:
dataCalifornia['data'].shape

(20640, 8)

In [17]:
dataCalifornia['data']

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [19]:
dataCalifornia['target']

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [20]:
X_temp = dataCalifornia['data']

In [21]:
y_temp = dataCalifornia['target']

In [25]:
X, y = X_temp, y_temp

In [26]:
#work with normalisated data
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

X_norm_minmax = MinMaxScaler().fit_transform(X, y)
X_norm_std = StandardScaler().fit_transform(X, y)

In [None]:
#add ones column


In [27]:
#split data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

In [50]:
#создаем класс для обучения всех моделей

from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans

class complex_fit():
    def __init__(self, n_clusters=2, min_samples=35, X=[1,0,1]):
        self.n_clusters = n_clusters
        self.min_samples = min_samples
        self.X = X
        self.model_names = ['kmeans', 'gauss', 'aglomer', 'dbscan']
        pass
    
    def y_pred_dbscan(self):
        dbscan = DBSCAN(eps=0.9, min_samples=self.min_samples)
        dbscan.fit(self.X)
        y_pred_dbscan = dbscan.labels_
        return y_pred_dbscan
    
    def y_pred_ac(self):
        ac = AgglomerativeClustering(n_clusters=self.n_clusters)
        ac.fit(self.X)
        y_pred_ac = ac.labels_.astype(np.int)
        return y_pred_ac
    
    def y_pred_gm(self):
        gm = GaussianMixture(n_components=self.n_clusters, random_state=42)
        y_pred_gm = gm.fit_predict(self.X)
        return y_pred_gm
    
    def y_pred_km(self):
        k_means = KMeans(n_clusters=self.n_clusters, random_state=42)
        k_means.fit(self.X)
        #k_means.cluster_centers_
        y_pred_km = k_means.labels_
        return y_pred_km

    def pred_array(self):
        x = [self.y_pred_km(), 
        self.y_pred_gm(), 
        self.y_pred_ac(), 
        self.y_pred_dbscan()]
        return x


In [52]:
cf = complex_fit(X=X_train)
pred_array = cf.pred_array()
model_names = cf.model_names

In [53]:
for x in model_names:
    print(x)

kmeans
gauss
aglomer
dbscan


In [71]:

from sklearn.metrics.cluster import v_measure_score, silhouette_score, homogeneity_score, completeness_score
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, roc_curve, roc_auc_score

for n, pred in enumerate(pred_array):
    x1 = completeness_score(labels_true=y_train, labels_pred=pred)
    print('полнота', model_names[n], x1)
    x2 = homogeneity_score(labels_true=y_train, labels_pred=pred)
    print('однородность', model_names[n], x2)
    x3 = v_measure_score(labels_true=y_train, labels_pred=pred)
    print('V мера', model_names[n], x3)
    x4 = silhouette_score(X=X_train, labels=pred)
    print('силуэт', model_names[n], x4)
#     x5 = mean_squared_error(y_train, pred)
#     print('MSE score', model_names[n], x5)
#     x6 =f1_score(y_train, pred)
#     print('F1 score', model_names[n], x6)



полнота kmeans 0.30750498824388267
однородность kmeans 0.014357445828095873
V мера kmeans 0.0274339950439415
силуэт kmeans 0.7070736864422698
полнота gauss 0.36957091256742164
однородность gauss 0.009501597491367477
V мера gauss 0.01852687263019106




силуэт gauss 0.72719416548616
полнота aglomer 0.46501102513618797
однородность aglomer 0.006786044347838329
V мера aglomer 0.013376875961773114




силуэт aglomer 0.807260299424983
полнота dbscan 1.0
однородность dbscan 0.0
V мера dbscan 0.0




ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)