In [1]:
# 1：ライブラリのインポート--------------------------------
import numpy as np #numpyという行列などを扱うライブラリを利用
import pandas as pd #pandasというデータ分析ライブラリを利用
import matplotlib.pyplot as plt #プロット用のライブラリを利用
import statsmodels.api as sm # OLS regression (used by the regression cells further down)
from sklearn import cluster, preprocessing #機械学習用のライブラリを利用
from sklearn import datasets #使用するデータ

In [2]:
# Allow wide DataFrames to render up to 500 columns before pandas truncates them.
pd.options.display.max_columns = 500

In [3]:
def compute_bic(kmeans,X):
    """
    Compute a BIC-style score for one fitted centroid-based clustering.

    Parameters:
    -----------------------------------------
    kmeans:  a single fitted clustering object exposing ``cluster_centers_``,
             ``labels_`` and ``n_clusters`` (the attributes read below)

    X     :  2-D array of shape (N, d) with the data points that were clustered

    Returns:
    -----------------------------------------
    (BIC, labels, m): the sign-flipped BIC value, the per-point cluster
    labels, and the number of clusters
    """
    # Centers and per-point assignments of the fitted model.
    centers = np.asarray(kmeans.cluster_centers_)
    labels  = np.asarray(kmeans.labels_)
    # Number of clusters.
    m = kmeans.n_clusters
    # Number of points in each cluster.
    n = np.bincount(labels)
    # Size of the data set: N points, d dimensions.
    X = np.asarray(X)
    N, d = X.shape

    # Pooled variance over all clusters: total squared Euclidean distance of
    # every point to its own cluster center, normalized by (N - m) * d.
    # Plain NumPy is used so that no extra (previously un-imported) distance
    # module is needed.
    total_sq_dist = np.sum((X - centers[labels]) ** 2)
    cl_var = (1.0 / (N - m) / d) * total_sq_dist

    # Complexity penalty.
    const_term = 0.5 * m * np.log(N) * (d+1)

    per_cluster = [n[i] * np.log(n[i]) -
                   n[i] * np.log(N) -
                   ((n[i] * d) / 2) * np.log(2*np.pi*cl_var) -
                   ((n[i] - 1) * d/ 2) for i in range(m)]
    BIC = np.sum(per_cluster) - const_term

    # Flip the sign of the penalized value before returning it.
    BIC = BIC*(-1)

    return(BIC, labels, m)

In [4]:
# Load the listing columns needed for the analysis; everything is read as a
# string first so the text clean-up below can run before the numeric cast.
df = pd.read_csv(r"../data/src/ch4/athome_bunkyo.csv",usecols = ["物件番号","公開月","徒歩(分)","所在階","Area","賃料・価格","how_old"],dtype="str")
# Keep rows whose publication month (公開月) contains "2015". na=False makes a
# missing publication month count as "no match"; without it the mask contains
# NaN and boolean indexing raises before dropna() ever runs.
df = df[df["公開月"].str.contains("2015", na=False)].dropna()
# One row per property number (物件番号).
df = df.drop_duplicates(subset = ["物件番号"])
# The identifier and the publication month are not model features.
df = df.drop(columns=["物件番号", "公開月"])
# Floor value 'B1' is rewritten to '-1' so the column can be cast to float
# (presumably basement level 1 - confirm against the data dictionary).
df["所在階"] = df["所在階"].replace('B1', '-1')
df = df.astype(float)
df.head(1)

Unnamed: 0,徒歩(分),賃料・価格,所在階,Area,how_old
40,9.0,65000.0,3.0,20.0,374.4


In [5]:
# Feature matrix with one row per listing. The column order is fixed here:
# walk minutes, floor, Area, how_old, rent/price.
# The builtin float replaces the np.float alias, which NumPy 1.24 removed.
feature_columns = ["徒歩(分)","所在階","Area","how_old","賃料・価格"]
test_array = np.asarray(df[feature_columns], dtype=float)

# Standardize every column (StandardScaler: zero mean, unit variance).
sc=preprocessing.StandardScaler()
X_norm=sc.fit_transform(test_array)
X_norm

array([[ 0.97028821, -0.44073542, -0.35844986,  1.02610327, -0.78448471],
       [ 1.27347573, -0.76640128, -0.12263059, -0.74426461, -0.32741153],
       [-1.15202442,  0.86192804, -0.39464275,  0.81866668, -0.72735056],
       ...,
       [ 0.36391317, -1.09206714,  0.34137352, -1.35998547,  0.45342182],
       [-0.24246187, -1.09206714,  0.4078214 ,  0.437874  ,  0.26297467],
       [-0.24246187, -1.09206714,  0.10725079,  1.8070009 , -0.11791965]])

In [6]:
# Fit one spectral clustering per candidate cluster count (2 through 7) on the
# standardized features; Spec_CL[j] is the model with n_clusters = j + 2.
# random_state is pinned so a re-run reproduces the same labels; later cells
# pick one of these models by index and depend on its cluster assignments.
Spec_CL = [
    cluster.SpectralClustering(
        n_clusters=n_clusters,
        affinity="nearest_neighbors",
        random_state=0,
    ).fit(X_norm)
    for n_clusters in range(2, 8)
]
print("Spec_CL done")



Spec_CL done


In [7]:
# Print how many points each fitted model assigned to every cluster label.
for scl in Spec_CL:
    label_series = pd.Series(scl.labels_, name=0)
    print(label_series.value_counts())

0    34310
1       85
Name: 0, dtype: int64
0    34329
2       40
1       26
Name: 0, dtype: int64
0    34326
1       38
3       20
2       11
Name: 0, dtype: int64
0    21061
2    13090
1      191
3       30
4       23
Name: 0, dtype: int64
0    22099
2     6996
5     5022
3      191
4       50
1       37
Name: 0, dtype: int64
0    34090
1      191
5       42
6       28
2       20
4       14
3       10
Name: 0, dtype: int64


In [8]:
# Attach cluster labels to the listings so OLS can be run per cluster.
# Index 4 of Spec_CL is the n_clusters=6 fit (the list starts at n_clusters=2).
six_cluster_model = Spec_CL[4]
df["Spcl_number"] = six_cluster_model.labels_

In [9]:
# Show the first two rows of the frame.
df.iloc[:2]

Unnamed: 0,徒歩(分),賃料・価格,所在階,Area,how_old,Spcl_number
40,9.0,65000.0,3.0,20.0,374.4,2
41,10.0,89000.0,2.0,28.34,114.666667,2


In [11]:
# All data: OLS of rent/price per unit of Area on walk minutes, floor and
# how_old, with an intercept column added by add_constant.
# (statsmodels is imported as `sm` in the notebook's import cell.)
X = sm.add_constant(df[["徒歩(分)","所在階","how_old"]])
Y = df["賃料・価格"]/df["Area"]
model = sm.OLS(Y, X)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.190
Model:                            OLS   Adj. R-squared:                  0.190
Method:                 Least Squares   F-statistic:                     2684.
Date:                Mon, 31 Dec 2018   Prob (F-statistic):               0.00
Time:                        16:00:57   Log-Likelihood:            -2.6979e+05
No. Observations:               34395   AIC:                         5.396e+05
Df Residuals:                   34391   BIC:                         5.396e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3708.6805     10.663    347.809      0.0

In [13]:
# Fit the same OLS specification separately inside every cluster.
# groupby iterates over the labels actually present in Spcl_number (in sorted
# order), so the loop no longer hard-codes the label list 0..5.
for cl_num6, df_olsee in df.groupby("Spcl_number"):
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("cl_n6_cl_{}".format(cl_num6))
    X = df_olsee[["徒歩(分)","所在階","how_old"]]
    # has_constant="add" forces the intercept column. With the default
    # ("skip"), add_constant adds nothing when a regressor is constant within
    # the cluster (e.g. every row has the same walk time or floor), so that
    # cluster would be fitted without a `const` term, unlike the others.
    X = sm.add_constant(X, has_constant="add")
    Y = df_olsee["賃料・価格"]/df_olsee["Area"]
    model = sm.OLS(Y, X)
    result = model.fit()
    print(result.summary())

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
cl_n6_cl_0
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.236
Model:                            OLS   Adj. R-squared:                  0.236
Method:                 Least Squares   F-statistic:                     2272.
Date:                Mon, 31 Dec 2018   Prob (F-statistic):               0.00
Time:                        16:01:50   Log-Likelihood:            -1.7393e+05
No. Observations:               22099   AIC:                         3.479e+05
Df Residuals:                   22095   BIC:                         3.479e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.144
Model:                            OLS   Adj. R-squared:                  0.144
Method:                 Least Squares   F-statistic:                     281.7
Date:                Mon, 31 Dec 2018   Prob (F-statistic):          5.07e-169
Time:                        16:01:51   Log-Likelihood:                -38198.
No. Observations:                5022   AIC:                         7.640e+04
Df Residuals:                    5018   BIC:                         7.643e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3594.7681     31.588    113.800      0.0

In [14]:
# Per-cluster descriptive statistics of every numeric column, saved to CSV and
# displayed as the cell output.
summary_columns = ["徒歩(分)","所在階","Area","賃料・価格","how_old"]
grouped_by_cluster = df.groupby("Spcl_number")
scldes = grouped_by_cluster[summary_columns].describe()
scldes.to_csv(r"../data/dev/ch4/athome_bunkyo_rent_scldes.csv")
scldes

Unnamed: 0_level_0,徒歩(分),徒歩(分),徒歩(分),徒歩(分),徒歩(分),徒歩(分),徒歩(分),徒歩(分),所在階,所在階,所在階,所在階,所在階,所在階,所在階,所在階,Area,Area,Area,Area,Area,Area,Area,Area,賃料・価格,賃料・価格,賃料・価格,賃料・価格,賃料・価格,賃料・価格,賃料・価格,賃料・価格,how_old,how_old,how_old,how_old,how_old,how_old,how_old,how_old
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Spcl_number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2
0,22099.0,5.350016,2.984253,1.0,3.0,5.0,7.0,40.0,22099.0,5.617494,3.107013,-1.0,3.0,5.0,7.0,24.0,22099.0,37.300749,43.08299,2.0,22.035,32.24,46.0,4812.0,22099.0,119796.386579,59528.82709,20000.0,79000.0,105000.0,148000.0,1800000.0,22099.0,233.844817,157.675292,-9.166667,113.566667,184.666667,348.033333,1401.133333
1,37.0,6.648649,2.030547,4.0,6.0,6.0,9.0,10.0,37.0,5.081081,2.861414,3.0,3.0,4.0,4.0,10.0,37.0,36.169189,23.295812,20.72,21.08,25.4,25.59,76.55,37.0,126023.243243,66688.810075,79400.0,80100.0,96500.0,99500.0,245000.0,37.0,66.32973,54.562601,-2.033333,1.033333,83.166667,90.3,137.966667
2,6996.0,8.966695,2.99991,5.0,7.0,8.0,10.0,31.0,6996.0,2.158662,1.025053,-1.0,1.0,2.0,3.0,12.0,6996.0,23.985596,7.294981,3.01,20.0,22.12,26.8525,55.5,6996.0,80394.263865,19564.414848,27000.0,68000.0,77000.0,88000.0,180000.0,6996.0,204.803878,123.579659,-3.066667,108.533333,168.4,305.366667,656.4
3,191.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,191.0,3.0,0.0,3.0,3.0,3.0,3.0,3.0,191.0,23.019686,2.889084,16.2,21.0,22.67,25.09,29.5,191.0,82337.696335,9243.907976,60000.0,78000.0,82000.0,90000.0,99000.0,191.0,151.016056,51.288051,82.2,111.1,145.066667,197.316667,285.133333
4,50.0,5.8,1.714286,4.0,5.0,5.0,5.75,9.0,50.0,6.6,3.129843,1.0,7.0,7.0,9.0,9.0,50.0,33.5564,17.200279,19.87,25.05,25.46,30.1,70.18,50.0,120980.0,44238.449432,79000.0,94000.0,98000.0,119000.0,210000.0,50.0,78.002667,58.911336,-3.066667,0.958333,98.433333,117.125,151.2
5,5022.0,3.543011,1.37481,1.0,3.0,4.0,5.0,6.0,5022.0,1.871565,0.675069,1.0,1.0,2.0,2.0,4.0,5022.0,24.770763,7.220087,9.5,20.01,23.09,28.24,61.12,5022.0,82876.825966,19767.541349,29000.0,70000.0,80000.0,90000.0,171000.0,5022.0,211.844305,122.11164,-4.066667,120.7,186.183333,313.533333,672.666667
