In [1]:
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import acquire
from scipy import stats
import sklearn.preprocessing
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import RFE
# modeling methods
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import KMeans

np.random.seed(123)

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Pasting a code I found from itertools documentation.

def combinations(iterable, r):
    # combinations('ABCD', 2) --> AB AC AD BC BD CD
    # combinations(range(4), 3) --> 012 013 023 123
    pool = tuple(iterable)
    n = len(pool)
    if r > n:
        return
    indices = list(range(r))
    yield tuple(pool[i] for i in indices)
    while True:
        for i in reversed(range(r)):
            if indices[i] != i + n - r:
                break
        else:
            return
        indices[i] += 1
        for j in range(i+1, r):
            indices[j] = indices[j-1] + 1
        yield tuple(pool[i] for i in indices)

In [3]:
zillow = acquire.get_zillow_data()

In [4]:
# Get a peek of the dataframe
zillow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77574 entries, 0 to 77573
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            77574 non-null  int64  
 1   parcelid                      77574 non-null  int64  
 2   airconditioningtypeid         25006 non-null  float64
 3   architecturalstyletypeid      206 non-null    float64
 4   basementsqft                  50 non-null     float64
 5   bathroomcnt                   77574 non-null  float64
 6   bedroomcnt                    77574 non-null  float64
 7   buildingclasstypeid           15 non-null     float64
 8   buildingqualitytypeid         49808 non-null  float64
 9   calculatedbathnbr             76959 non-null  float64
 10  decktypeid                    614 non-null    float64
 11  finishedfloor1squarefeet      6035 non-null   float64
 12  calculatedfinishedsquarefeet  77374 non-null  float64
 13  f

In [5]:
def create_features(df):
    df['age'] = 2017 - df.yearbuilt
    df['age_bin'] = pd.cut(df.age, 
                           bins = [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140],
                           labels = [0, .066, .133, .20, .266, .333, .40, .466, .533, 
                                     .60, .666, .733, .8, .866, .933])

    # create taxrate variable
    df['taxrate'] = df.taxamount/df.taxvaluedollarcnt*100

    # create acres variable
    df['acres'] = df.lotsizesquarefeet/43560

    # bin acres
    df['acres_bin'] = pd.cut(df.acres, bins = [0, .10, .15, .25, .5, 1, 5, 10, 20, 50, 200], 
                       labels = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9])

    # square feet bin
    df['sqft_bin'] = pd.cut(df.calculatedfinishedsquarefeet, 
                            bins = [0, 800, 1000, 1250, 1500, 2000, 2500, 3000, 4000, 7000, 12000],
                            labels = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9]
                       )

    # dollar per square foot-structure
    df['structure_dollar_per_sqft'] = df.structuretaxvaluedollarcnt/df.calculatedfinishedsquarefeet


    df['structure_dollar_sqft_bin'] = pd.cut(df.structure_dollar_per_sqft, 
                                             bins = [0, 25, 50, 75, 100, 150, 200, 300, 500, 1000, 1500],
                                             labels = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9]
                                            )


    # dollar per square foot-land
    df['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt/df.lotsizesquarefeet

    df['lot_dollar_sqft_bin'] = pd.cut(df.land_dollar_per_sqft, bins = [0, 1, 5, 20, 50, 100, 250, 500, 1000, 1500, 2000],
                                       labels = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9]
                                      )


    # update datatypes of binned values to be float
    df = df.astype({'sqft_bin': 'float64', 'acres_bin': 'float64', 'age_bin': 'float64',
                    'structure_dollar_sqft_bin': 'float64', 'lot_dollar_sqft_bin': 'float64'})


    # ratio of bathrooms to bedrooms
    df['bath_bed_ratio'] = df.bathroomcnt/df.bedroomcnt

    # 12447 is the ID for city of LA. 
    # I confirmed through sampling and plotting, as well as looking up a few addresses.
    df['cola'] = df['regionidcity'].apply(lambda x: 1 if x == 12447.0 else 0)

    return df

In [6]:
zillow = create_features(zillow)

In [7]:
# Create a function that will remove rows and columns that have missing values past a certain threshold.
def handle_missing_values(df, p_row = 0.84, p_col = 0.84):
    ''' function which takes in a dataframe, required notnull proportions of non-null rows and columns.
    drop the columns and rows columns based on theshold:'''
    
    #drop columns with nulls
    threshold = int(p_col * len(df.index)) # Require that many non-NA values.
    df.dropna(axis = 1, thresh = threshold, inplace = True)
    
    #drop rows with nulls
    threshold = int(p_row * len(df.columns)) # Require that many non-NA values.
    df.dropna(axis = 0, thresh = threshold, inplace = True)
    
    
    return df

In [8]:
b = handle_missing_values(zillow)

In [9]:
b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77332 entries, 0 to 77573
Data columns (total 41 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            77332 non-null  int64  
 1   parcelid                      77332 non-null  int64  
 2   bathroomcnt                   77332 non-null  float64
 3   bedroomcnt                    77332 non-null  float64
 4   calculatedbathnbr             76915 non-null  float64
 5   calculatedfinishedsquarefeet  77330 non-null  float64
 6   finishedsquarefeet12          73878 non-null  float64
 7   fips                          77332 non-null  float64
 8   fullbathcnt                   76915 non-null  float64
 9   latitude                      77332 non-null  float64
 10  longitude                     77332 non-null  float64
 11  lotsizesquarefeet             69202 non-null  float64
 12  propertycountylandusecode     77332 non-null  object 
 13  p

In [10]:
# Remove observations where bedroom and bathroom counts are zero.
b = b[b.bedroomcnt != 0]
b = b[b.bathroomcnt !=0]

In [11]:
b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76675 entries, 0 to 77573
Data columns (total 41 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            76675 non-null  int64  
 1   parcelid                      76675 non-null  int64  
 2   bathroomcnt                   76675 non-null  float64
 3   bedroomcnt                    76675 non-null  float64
 4   calculatedbathnbr             76659 non-null  float64
 5   calculatedfinishedsquarefeet  76673 non-null  float64
 6   finishedsquarefeet12          73525 non-null  float64
 7   fips                          76675 non-null  float64
 8   fullbathcnt                   76659 non-null  float64
 9   latitude                      76675 non-null  float64
 10  longitude                     76675 non-null  float64
 11  lotsizesquarefeet             68561 non-null  float64
 12  propertycountylandusecode     76675 non-null  object 
 13  p

In [12]:
# Take a look at the amount of missing values.
b.isnull().sum()

id                                 0
parcelid                           0
bathroomcnt                        0
bedroomcnt                         0
calculatedbathnbr                 16
calculatedfinishedsquarefeet       2
finishedsquarefeet12            3150
fips                               0
fullbathcnt                       16
latitude                           0
longitude                          0
lotsizesquarefeet               8114
propertycountylandusecode          0
propertylandusetypeid              0
rawcensustractandblock             0
regionidcity                    1425
regionidcounty                     0
regionidzip                       23
roomcnt                            0
yearbuilt                         57
structuretaxvaluedollarcnt        70
taxvaluedollarcnt                  0
assessmentyear                     0
landtaxvaluedollarcnt              0
taxamount                          4
censustractandblock              191
logerror                           0
t

In [13]:
# Create a list of columns to drop.
columns_to_drop = ['calculatedbathnbr','finishedsquarefeet12','id','transactiondate','parcelid','fullbathcnt','propertycountylandusecode','propertylandusetypeid','rawcensustractandblock','regionidcounty','regionidzip','censustractandblock','propertylandusedesc']

In [14]:
# Create a function that will drop unwanted columns.
def drop_columns(df, drop_col):
    df = df.drop(columns=drop_col)
    return df

In [15]:
b = drop_columns(b, columns_to_drop)

In [16]:
b.isna().sum()

bathroomcnt                        0
bedroomcnt                         0
calculatedfinishedsquarefeet       2
fips                               0
latitude                           0
longitude                          0
lotsizesquarefeet               8114
regionidcity                    1425
roomcnt                            0
yearbuilt                         57
structuretaxvaluedollarcnt        70
taxvaluedollarcnt                  0
assessmentyear                     0
landtaxvaluedollarcnt              0
taxamount                          4
logerror                           0
age                               57
age_bin                           59
taxrate                            4
acres                           8114
acres_bin                       8114
sqft_bin                          12
structure_dollar_per_sqft         72
structure_dollar_sqft_bin         72
land_dollar_per_sqft            8114
lot_dollar_sqft_bin             8115
bath_bed_ratio                     0
c

### Drop more columns, that are captured in created features, before splitting.

In [17]:
# Drop columns that are captured in other features.
b.drop(columns = ['bedroomcnt', 'taxamount', 'taxvaluedollarcnt', 'fips', 
                                       'structure_dollar_per_sqft', 'land_dollar_per_sqft', 'yearbuilt', 
                                       'lotsizesquarefeet', 'regionidcity', 
                                       'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt'], inplace=True)

In [18]:
b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76675 entries, 0 to 77573
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   bathroomcnt                   76675 non-null  float64
 1   calculatedfinishedsquarefeet  76673 non-null  float64
 2   latitude                      76675 non-null  float64
 3   longitude                     76675 non-null  float64
 4   roomcnt                       76675 non-null  float64
 5   assessmentyear                76675 non-null  float64
 6   logerror                      76675 non-null  float64
 7   age                           76618 non-null  float64
 8   age_bin                       76616 non-null  float64
 9   taxrate                       76671 non-null  float64
 10  acres                         68561 non-null  float64
 11  acres_bin                     68561 non-null  float64
 12  sqft_bin                      76663 non-null  float64
 13  s

### There are a few more columns with missing values that I will just drop.

In [19]:
# split test off, 20% of original df size. 
train_validate, test = train_test_split(b, test_size=.2, 
                                        random_state=42)

# split validate off, 30% of what remains (24% of original df size)
# thus train will be 56% of original df size. 
train, validate = train_test_split(train_validate, test_size=.3, 
                                   random_state=42)

print("train observations: ", train.shape)
print("validate observations: ", validate.shape)
print("test observations: ", test.shape)

train observations:  (42938, 17)
validate observations:  (18402, 17)
test observations:  (15335, 17)


In [20]:
train.isnull().sum()

bathroomcnt                        0
calculatedfinishedsquarefeet       1
latitude                           0
longitude                          0
roomcnt                            0
assessmentyear                     0
logerror                           0
age                               28
age_bin                           30
taxrate                            2
acres                           4573
acres_bin                       4573
sqft_bin                           6
structure_dollar_sqft_bin         44
lot_dollar_sqft_bin             4573
bath_bed_ratio                     0
cola                               0
dtype: int64

In [21]:
# I've decided to drop the null values instead of imputing. I will comment out the imputing lines of code.
train.dropna(inplace=True)
validate.dropna(inplace=True)
test.dropna(inplace=True)

In [None]:
#plt.figure(figsize=(10,16))

sns.relplot(train.longitude, train.latitude, hue=train.fips, height=15)

# Takeaway
* Initially I thought logerror would vary noticeably depending on the latitude and longitude. But, the logerror seems to be uniform all throughout the landscape.
* Maybe if I bin logerror I can get a better visualization.

In [None]:
# Create a bunch of bins for logerror to see if it will help with visualzing logerror when plotted on top of longitude vs latitude
train['logerror_bins'] = pd.cut(train.logerror, [-5, -.2, -.05, .05, .2, 4])



In [None]:
sns.relplot(train.longitude, train.latitude, hue=train.logerror_bins, height=10)

# Takeaways
* Binning the logerror made some of the observations stand out a little more, but I can't see any patterns as of now.
* Just call me Dora because I am about to...

### Explore!

### Target Variable: logerror
#### What will all our features look like plotted against the target (and everything else)?

In [None]:
sns.pairplot(train, corner=True, hue='fips')

In [None]:
plt.figure(figsize=(10, 18))

plt.subplot(221)
plt.scatter(train.bedroomcnt, train.logerror)
plt.ylabel('logerror')
plt.xlabel('bedrooms')

plt.subplot(222)
plt.scatter(train.bathroomcnt, train.logerror)
plt.ylabel('logerror')
plt.xlabel('bathrooms')

plt.subplot(223)
plt.scatter(train.calculatedfinishedsquarefeet, train.logerror)
plt.ylabel('logerror')
plt.xlabel('home_area')

plt.subplot(224)
plt.scatter(train.lotsizesquarefeet, train.logerror)
plt.ylabel('logerror')
plt.xlabel('land_area')

In [None]:
train.info()

# Takeaways

* There doesn't seem to be any linear trend when plotting the independent variable against our target.
* There is a large variance in logerror among the lower end of the sepctrum across all variables.
* Small land area seems to have higher logerror.

### Scale the data

In [147]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38285 entries, 18253 to 5668
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   bathroomcnt                   38285 non-null  float64
 1   calculatedfinishedsquarefeet  38285 non-null  float64
 2   latitude                      38285 non-null  float64
 3   longitude                     38285 non-null  float64
 4   roomcnt                       38285 non-null  float64
 5   assessmentyear                38285 non-null  float64
 6   logerror                      38285 non-null  float64
 7   age                           38285 non-null  float64
 8   age_bin                       38285 non-null  float64
 9   taxrate                       38285 non-null  float64
 10  acres                         38285 non-null  float64
 11  acres_bin                     38285 non-null  float64
 12  sqft_bin                      38285 non-null  float64
 13

In [148]:
# Create a scaler object using the MinMaxScaler
scaler = sklearn.preprocessing.MinMaxScaler()

# Fit the scaler on the train dataset
# Use the select_dtypes to avoid an error when it tries to fit on the datetime datatype
scaler.fit(train)

# Use the scaler to transform the datasets
# Use the select_dtypes to avoid an error when it tries to transform on the datetime datatype
#train_scaled = scaler.transform(train.select_dtypes(include='float'))
train_scaled = pd.DataFrame(scaler.transform(train), index = train.index, columns = train.columns)
#validate_scaled = scaler.transform(validate.select_dtypes(include='float'))
validate_scaled = pd.DataFrame(scaler.transform(validate), index = validate.index, columns = validate.columns)
#test_scaled = scaler.transform(test.select_dtypes(include='float'))
test_scaled = pd.DataFrame(scaler.transform(test), index = test.index, columns = test.columns)

In [167]:
X_train

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,acres_bin,sqft_bin,structure_dollar_sqft_bin,lot_dollar_sqft_bin,bath_bed_ratio,cola
18253,4.0,3836.0,34524767.0,-118341486.0,0.0,2016.0,34.0,0.266,1.122547,5.055854,0.6,0.7,0.4,0.0,1.333333,0
29637,2.5,1725.0,34269124.0,-118756089.0,6.0,2016.0,53.0,0.400,1.298319,0.151469,0.2,0.4,0.3,0.2,0.625000,0
66783,3.0,2256.0,33864940.0,-118378922.0,0.0,2016.0,55.0,0.400,1.233884,0.172176,0.2,0.5,0.0,0.5,0.600000,0
58524,2.0,1631.0,34559188.0,-118048053.0,0.0,2016.0,28.0,0.200,1.722238,0.162121,0.2,0.4,0.3,0.2,0.666667,0
3965,2.5,1472.0,33610842.0,-117621681.0,0.0,2016.0,27.0,0.200,1.271043,0.072314,0.0,0.3,0.4,0.4,0.833333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,1.5,2835.0,34248881.0,-119064191.0,7.0,2016.0,46.0,0.333,1.090153,0.298508,0.3,0.6,0.3,0.3,0.500000,0
16805,2.0,1479.0,33934472.0,-118284477.0,0.0,2016.0,76.0,0.533,1.245178,0.114968,0.1,0.3,0.2,0.3,0.500000,1
38915,2.0,1981.0,34700588.0,-118162001.0,0.0,2016.0,60.0,0.400,1.748088,0.160950,0.2,0.4,0.2,0.1,0.500000,0
45130,2.0,1036.0,33845399.0,-118295183.0,0.0,2016.0,64.0,0.466,1.281659,0.095202,0.0,0.2,0.2,0.4,0.666667,0


In [168]:
X_train_scaled.head()

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,...,cluster_550,cluster_551,cluster_552,cluster_553,cluster_554,cluster_555,cluster_556,cluster_557,cluster_558,cluster_559
18253,0.333333,0.313558,0.813292,0.590575,0.0,0.0,0.23913,0.285102,0.024585,0.03156,...,0,3,0,3,2,0,1,0,1,1
29637,0.166667,0.129384,0.637874,0.374641,0.4,0.0,0.376812,0.428725,0.028467,0.000913,...,1,1,0,3,0,0,1,0,1,1
66783,0.222222,0.175711,0.36053,0.571078,0.0,0.0,0.391304,0.428725,0.027044,0.001042,...,1,0,0,1,2,0,2,0,2,3
58524,0.111111,0.121183,0.836911,0.743402,0.0,0.0,0.195652,0.214362,0.037829,0.000979,...,1,1,0,3,0,0,1,0,1,1
3965,0.166667,0.107311,0.186172,0.965465,0.0,0.0,0.188406,0.214362,0.027864,0.000418,...,2,1,2,0,3,2,0,2,1,3


## Separate the target from the independent variables.

In [23]:
X_train = train.drop(columns=['logerror'])
X_validate = validate.drop(columns='logerror')
X_test = test.drop(columns='logerror')

In [25]:
def Min_Max_Scaler(X_train, X_validate, X_test):
    """
    Takes in X_train, X_validate and X_test dfs with numeric values only
    Returns scaler, X_train_scaled, X_validate_scaled, X_test_scaled dfs 
    """
    scaler = sklearn.preprocessing.MinMaxScaler().fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), index = X_train.index, columns = X_train.columns)
    X_validate_scaled = pd.DataFrame(scaler.transform(X_validate), index = X_validate.index, columns = X_validate.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), index = X_test.index, columns = X_test.columns)
    
    return scaler, X_train_scaled, X_validate_scaled, X_test_scaled

In [26]:
scaler, X_train_scaled, X_validate_scaled, X_test_scaled = Min_Max_Scaler(X_train, X_validate, X_test)

In [27]:
y_train = train[['logerror']]
y_validate = validate[['logerror']]
y_test = test[['logerror']]

### Choose top five original features using Select K Best

In [29]:
# Set the number of top features to present
k = 5

# Let's start with Select K Best
# Make the thing
kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=5)

# fit the thing
kbest.fit(X_train, y_train)

# use the thing, 
# get_support() produces an array of booleans, so we can filter out the column names that matter the most
kbest_features = X_train.columns[kbest.get_support()].tolist()

print("KBest's 5 best features are", kbest_features)

KBest's 5 best features are ['bathroomcnt', 'calculatedfinishedsquarefeet', 'longitude', 'sqft_bin', 'structure_dollar_sqft_bin']


### Choose top five original features using Recursive Feature Elimination

In [30]:
# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select=5)

# fit the data using RFE
rfe.fit(X_train_scaled,y_train)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()

In [31]:
rfe_feature

['calculatedfinishedsquarefeet',
 'taxrate',
 'acres',
 'sqft_bin',
 'structure_dollar_sqft_bin']

# Set the baseline model using mean logerror.

In [32]:
# 1. Predict logerror mean
logerror_pred_mean = y_train['logerror'].mean()
y_train['logerror_pred_mean'] = logerror_pred_mean
y_validate['logerror_pred_mean'] = logerror_pred_mean

# 2. compute logerror_pred_median
logerror_pred_median = y_train['logerror'].median()
y_train['logerror_pred_median'] = logerror_pred_median
y_validate['logerror_pred_median'] = logerror_pred_median

# 3. RMSE of logerror_pred_mean
rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_mean)**(1/2)
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_mean)**(1/2)

print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train, 5), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 5))

# 4. RMSE of logerror_pred_median
rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_median)**(1/2)
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_median)**(1/2)

print("RMSE using Median\nTrain/In-Sample: ", round(rmse_train, 5), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 5))

RMSE using Mean
Train/In-Sample:  0.16811 
Validate/Out-of-Sample:  0.17074
RMSE using Median
Train/In-Sample:  0.16845 
Validate/Out-of-Sample:  0.17094


# Try using a linear regression model (OLS)

In [33]:
# create the model object
lm = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column since we added additional columns to y_train.
lm.fit(X_train[rfe_feature], y_train.logerror)

# predict train
y_train['logerror_pred_lm'] = lm.predict(X_train[rfe_feature])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_lm)**(1/2)

# predict validate
y_validate['logerror_pred_lm'] = lm.predict(X_validate[rfe_feature])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_lm)**(1/2)

print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for OLS using LinearRegression
Training/In-Sample:  0.16785751910382743 
Validation/Out-of-Sample:  0.17038545904051638


### Try using a LassoLars model

In [34]:
# create the model object
lars = LassoLars(alpha=1.2)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lars.fit(X_train[rfe_feature], y_train.logerror)

# predict train
y_train['logerror_pred_lars'] = lars.predict(X_train[rfe_feature])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_lars)**(1/2)

# predict validate
y_validate['logerror_pred_lars'] = lars.predict(X_validate[rfe_feature])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_lars)**(1/2)

print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for Lasso + Lars
Training/In-Sample:  0.16810796491510668 
Validation/Out-of-Sample:  0.17073853585433502


### Further exploration

#### What is the distribution of each of the  independent variables?

In [None]:
for col in train.columns:
    plt.figure(figsize=(4,2))
    plt.hist(train[col])
    plt.title(col)
    plt.show()

### Does the mean logerror differ among the three counties?
LA_County = 6037

Orange_County = 6059

Ventura_County = 6111

In [None]:
plt.title("Are Orange County Homes Consistently Being Overvalued?")
sns.barplot(x="fips", y="logerror", data=train)
population_logerror_mean = train.logerror.mean()
plt.axhline(population_logerror_mean, label="Population average logerror")
plt.legend()
plt.show()

# Takeaway
* The mean logerror of Orange County seems to be significantly higher than the population average.
* The mean logerror of Ventura County seems slightly higher than the population average.
* The mean logerror of LA County seems slightly lower than the population average.

### I will run a statistical test to compare the mean logerror between counties.

In [None]:
## Create a dataframe for each county.
#LA_county = train[train['fips']== 6037]
#Ventura_county = train[train['fips']== 6111]
#Orange_county = train[train['fips']== 6059]
#
## Create list of counties to make looping easier.
#county_list = [LA_county, Ventura_county, Orange_county]
#
## Verify shape of new dataframes.
#LA_county.shape, Ventura_county.shape, Orange_county.shape
#
## Create variables that hold the logerror values for each county.
#LA_logerror = LA_county['logerror']
#Ventura_logerror = Ventura_county['logerror']
#Orange_logerror = Orange_county['logerror']
#
#def create_features(df):
#    df['age'] = 2017 - df.yearbuilt
#    df['age_bin'] = pd.cut(df.age, 
#                           bins = [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140],
#                           labels = [0, .066, .133, .20, .266, .333, .40, .466, .533, 
#                                     .60, .666, .733, .8, .866, .933])
#
#    # create taxrate variable
#    df['taxrate'] = df.taxamount/df.taxvaluedollarcnt*100
#
#    # create acres variable
#    df['acres'] = df.lotsizesquarefeet/43560
#
#    # bin acres
#    df['acres_bin'] = pd.cut(df.acres, bins = [0, .10, .15, .25, .5, 1, 5, 10, 20, 50, 200], 
#                       labels = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9])
#
#    # square feet bin
#    df['sqft_bin'] = pd.cut(df.calculatedfinishedsquarefeet, 
#                            bins = [0, 800, 1000, 1250, 1500, 2000, 2500, 3000, 4000, 7000, 12000],
#                            labels =  [0, .1, .2, .3, .4, .5, .6, .7, .8, .9]
#                       )
#
#    # dollar per square foot-structure
#    df['structure_dollar_per_sqft'] = df.structuretaxvaluedollarcnt/df.calculatedfinishedsquarefeet
#
#
#    df['structure_dollar_sqft_bin'] = pd.cut(df.structure_dollar_per_sqft, 
#                                             bins = [0, 25, 50, 75, 100, 150, 200, 300, 500, 1000, 1500],
#                                             labels = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9]
#                                            )
#
#
#    # dollar per square foot-land
#    df['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt/df.lotsizesquarefeet
#
#    df['lot_dollar_sqft_bin'] = pd.cut(df.land_dollar_per_sqft, bins = [0, 1, 5, 20, 50, 100, 250, 500, 1000, 1500, 2000],
#                                       labels = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9]
#                                      )
#
#
#    # update datatypes of binned values to be float
#    df = df.astype({'sqft_bin': 'float64', 'acres_bin': 'float64', 'age_bin': 'float64',
#                    'structure_dollar_sqft_bin': 'float64', 'lot_dollar_sqft_bin': 'float64'})
#
#
#    # ratio of bathrooms to bedrooms
#    df['bath_bed_ratio'] = df.bathroomcnt/df.bedroomcnt
#
#    # 12447 is the ID for city of LA. 
#    # I confirmed through sampling and plotting, as well as looking up a few addresses.
#    df['cola'] = df['regionidcity'].apply(lambda x: 1 if x == 12447.0 else 0)
#
#    return df

In [None]:
# Plot a histogram of logerror for each county.

for county in county_list:
    plt.figure(figsize=(10,8))
    plt.hist('logerror', bins=100, data=county)
    plt.show()

# Takeaways
* The logerror in all three counties are normally(ish) distributed. I will continue with an Independent t-test(setting equal_var to False if I must).

#### $H_0$: The mean logerror of Orange County is equal to the mean logerror of Ventura County.
#### $H_a$: The mean logerror of Orange County is greater than the mean logerror of Ventura County.

In [None]:
# Check variances
LA_logerror.var(), Ventura_logerror.var(), Orange_logerror.var()

In [None]:
# A stats Levene test - returns p value. small p-value means unequal variances
print(stats.levene(LA_logerror, Ventura_logerror))
print(stats.levene(LA_logerror, Orange_logerror))
print(stats.levene(Ventura_logerror, Orange_logerror))
# high p-value suggests that the populations have equal variances

# Takeaway
* There are equal variances between LA County logerror and both Ventura County and Orange County logerror.
* There is an unqual variance between Ventura County logerror and Orange County logerror.
* I will set the equal_var argument to False for the Ventura_vs_Orange Independent T-Test.

In [None]:
# Set the level of significance (alpha):
alpha = 0.05

In [None]:
# Use a Kruskal-Wallis test to test for equality in mean logerror between the three counties.
t, p = stats.kruskal(LA_logerror, Ventura_logerror, Orange_logerror)
print(f'p-value: {p}')
if p < alpha:
    print("We reject the null hypothesis that the mean logerror of the three counties are equal.")
else:
    pinrt("We fail to reject the null hypothesis.")

# Takeaway
* There is a difference between the three counties in mean logerror. I will create dummy variables for the counties and use them as a features in my model. 

### From the pairplot I noticed that there was a greater variance in mean logerror for homes built after 1940. I will bin the train.yearbuilt into two bins 0: Houses built before 1940 and 1: Houses built after 1940.

### Takeaway
* It seems as if the variance in logerror increases for homes built after 1940.
* There seems to be 3 clusters. A middle cluster where most of the datapoints are situated, a top cluster, and a lower cluster.
* Maybe if create one bin from yearbuilt from yearbuilt.min() to about yearbuilt <= 1940 and one more bin for homes built after 1940 and compare the means between the two bins to see if there is a statistical difference.

# Speaking of clusters...

# *********************************************************************

In [35]:
run_this_through_the_triplet_function = list(X_train_scaled.columns)

In [36]:
# Get every possible combination of a set of three features
list_of_feature_combinations = list(combinations(run_this_through_the_triplet_function,3))

In [37]:
# Fit this with clustering? I think that's the correct term. What is the length?
len(list_of_feature_combinations)

560

### I have a list of 560 combinations of features to cluster on. If I divide it into groups of 100 I will have 560 (560/100) lists of features to work on at a time as to not overload the computer. I will print the [{inertia vs k}] graph for each group of ten and choose a k value based on the graphs. After I have my k value for each group I will assign that to the number of centroids for each group.

In [38]:
X_train_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,acres_bin,sqft_bin,structure_dollar_sqft_bin,lot_dollar_sqft_bin,bath_bed_ratio,cola
18253,0.333333,0.313558,0.813292,0.590575,0.000000,0.0,0.239130,0.285102,0.024585,0.031560,0.666667,0.777778,0.444444,0.000000,0.236111,0.0
29637,0.166667,0.129384,0.637874,0.374641,0.400000,0.0,0.376812,0.428725,0.028467,0.000913,0.222222,0.444444,0.333333,0.222222,0.088542,0.0
66783,0.222222,0.175711,0.360530,0.571078,0.000000,0.0,0.391304,0.428725,0.027044,0.001042,0.222222,0.555556,0.000000,0.555556,0.083333,0.0
58524,0.111111,0.121183,0.836911,0.743402,0.000000,0.0,0.195652,0.214362,0.037829,0.000979,0.222222,0.444444,0.333333,0.222222,0.097222,0.0
3965,0.166667,0.107311,0.186172,0.965465,0.000000,0.0,0.188406,0.214362,0.027864,0.000418,0.000000,0.333333,0.444444,0.444444,0.131944,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,0.055556,0.226226,0.623984,0.214175,0.466667,0.0,0.326087,0.356913,0.023869,0.001832,0.333333,0.666667,0.333333,0.333333,0.062500,0.0
16805,0.111111,0.107922,0.408241,0.620267,0.000000,0.0,0.543478,0.571275,0.027293,0.000685,0.111111,0.333333,0.222222,0.333333,0.062500,1.0
38915,0.111111,0.151719,0.933938,0.684055,0.000000,0.0,0.427536,0.428725,0.038400,0.000972,0.222222,0.444444,0.222222,0.111111,0.062500,0.0
45130,0.111111,0.069272,0.347121,0.614691,0.000000,0.0,0.456522,0.499464,0.028099,0.000561,0.000000,0.222222,0.222222,0.444444,0.097222,0.0


In [40]:
# I will convert all the tuples into lists and then to eventually Pandas dataframes.
# I'm not sure how long this will take. So I will test it on the first 10.
first_hundred = list_of_feature_combinations[:100]
second_hundred = list_of_feature_combinations[100:200]
third_hundred = list_of_feature_combinations[200:300]
fourth_hundred = list_of_feature_combinations[300:400]
fifth_hundred = list_of_feature_combinations[400:500]
last_sixty = list_of_feature_combinations[500:560]

In [None]:
## THIS FUNCTION IS CURRENTLY UNDER CONSTRUCTION. PLEASE DO NOT ENTER... UNLESS YOU WANT TO PEFECT IT FOR ME.
# This function will take in a list of feature combinations which will be used for clustering. It will 
# return a list of the top 100 features:
#def get_rfe_ranking_for_bdoggs_clusters(hunnid):
#    # Create a list of the 100 features.
#    list_of_hundred_features = []
#    for combo in hunnid:
#        list_of_hundred_features.append(list(combo))
#    # Convert into dataframes.
#    list_of_dataframes = []
#    for combo in list_of_first_hundred_features:
#        list_of_dataframes.append(X_train[combo])
#    # plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
#    # 100 dataframes will take ~ 9 minutes and 10 seconds to render  & CPU % up to 675
#    # *** ONLY RUN IF YOU HAVE TIME! ***
#    for combo in list_of_dataframes:
#        with plt.style.context('seaborn-whitegrid'):
#            plt.figure(figsize=(9, 6))
#            pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
#            plt.xticks(range(2, 12))
#            plt.xlabel('k')
#            plt.ylabel('inertia')
#            plt.title('Change in inertia as k increases')
#    
#    # This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.
#
#    # This cell takes ~ 30 seconds  to render with 100 clusters.
#
#    # set i to be the iteration count for number_of_centroids index
#    # set n to be the iteration count for list_of_dataframes index
#    i = 0
#    k = 0
#    list_of_cluster_predictions = []
#    for combo in list_of_dataframes: 
#        kmeans = KMeans(n_clusters = number_of_centroids[k])
#        kmeans.fit(list_of_dataframes[i])
#        X_train_scaled['cluster_' + str(i)] = kmeans.predict(combo)
#        list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
#        i += 1
#        k += 1
#    # Initialize the linear regression object!
#    lm = LinearRegression()
#
#    # Initialize the RFE object, setting the hyperparameters to be our linear regression object created above
#    # (as the algorithm to test the features on) and the number of features to return to be 5.
#    rfe = RFE(lm, n_features_to_select=16)
#
#    # Fit the RFE object to our data. This means create multiple linear regression models, find the one that 
#    # performs best, and identify the features that are used in that model. Those are the features we want.
#    # Transform our X dataframe to include only those 2 features. .transform() or do both of those steps together 
#    # with .fit_transform()
#
#    # Transforming data using RFE.
#    X_rfe = rfe.fit_transform(X_train_scaled,y_train)  
#
#    #Fitting the data to model.
#    lm.fit(X_rfe,y_train)
#    
#    # Create boolean mask.
#    mask = rfe.support_
#    
#    # Store top 100 features into this variable.
#    rfe_features = X_train_scaled.loc[:, mask].columns.tolist()
#    
#    return rfe_features

# First Hundred

In [41]:
# Create a list of the first 100 features.
list_of_hundred_features = []
for combo in first_hundred:
    list_of_hundred_features.append(list(combo))

In [42]:
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_train_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')
        

In [43]:
# Create a list for the k values to pass into a for loop.
number_of_centroids = [5,3,4,4,4,5,4,4,5,4,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,4,4,4,3,4,4,3,3,4,6,4,5,4,6,5,4,4,4,5,4,5,4,6,6,4,5,4,4,4,4,5,7,4,5,5,4,5,5,4,4,6,4,3,4,3,5,4,4,4,3,3,4,4,3,4,3,4,4,4,3,3,3,3,3,3,3,4,4,3]

len(number_of_centroids)

100

In [44]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 0
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_train_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

In [45]:
X_train_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,...,cluster_90,cluster_91,cluster_92,cluster_93,cluster_94,cluster_95,cluster_96,cluster_97,cluster_98,cluster_99
18253,0.333333,0.313558,0.813292,0.590575,0.000000,0.0,0.239130,0.285102,0.024585,0.031560,...,1,1,1,1,0,1,0,3,2,2
29637,0.166667,0.129384,0.637874,0.374641,0.400000,0.0,0.376812,0.428725,0.028467,0.000913,...,0,2,2,0,2,2,1,2,0,2
66783,0.222222,0.175711,0.360530,0.571078,0.000000,0.0,0.391304,0.428725,0.027044,0.001042,...,1,0,0,2,2,0,0,0,2,1
58524,0.111111,0.121183,0.836911,0.743402,0.000000,0.0,0.195652,0.214362,0.037829,0.000979,...,0,2,2,0,2,2,1,2,0,2
3965,0.166667,0.107311,0.186172,0.965465,0.000000,0.0,0.188406,0.214362,0.027864,0.000418,...,0,2,0,0,2,2,2,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,0.055556,0.226226,0.623984,0.214175,0.466667,0.0,0.326087,0.356913,0.023869,0.001832,...,1,1,2,0,0,1,0,0,2,0
16805,0.111111,0.107922,0.408241,0.620267,0.000000,0.0,0.543478,0.571275,0.027293,0.000685,...,0,0,2,0,1,0,2,2,3,1
38915,0.111111,0.151719,0.933938,0.684055,0.000000,0.0,0.427536,0.428725,0.038400,0.000972,...,0,0,2,0,2,0,1,2,0,1
45130,0.111111,0.069272,0.347121,0.614691,0.000000,0.0,0.456522,0.499464,0.028099,0.000561,...,0,0,0,0,2,0,2,1,0,0


# Second Hundred 

In [46]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in second_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_train_scaled[combo])

In [None]:

# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# 100 dataframes will take ~ 9 minutes and 10 seconds to render  & CPU % up to 675
# *** ONLY RUN IF YOU HAVE TIME! ***
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')



In [47]:
number_of_centroids = [4,4,4,4,4,4,4,4,4,4,4,4,3,3,4,4,4,4,4,4,3,3,3,4,4,3,3,4,3,3,2,3,4,4,3,3,3,4,4,4,4,4,3,4,4,4,4,3,4,4,4,3,3,3,3,3,3,4,3,4,4,4,4,4,4,4,3,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,4,4,3,3,4,4,4,4,5,3]

len(number_of_centroids)

100

In [48]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.
# set i to be the iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 100
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_train_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

In [None]:
X_train_scaled

# Third Hundred

In [49]:
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in third_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_train_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# 100 dataframes will take ~ 9 minutes and 10 seconds to render  & CPU % up to 675
# *** ONLY RUN IF YOU HAVE TIME! ***
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [50]:
number_of_centroids = [4,4,5,4,4,4,4,4,3,5,4,4,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,4,3,4,3,3,4,4,4,4,4,4,4,4,4,4,4,3,3,4,3,4,3,4,4,4,4,4,4,4,3,3,4,3,4,3,4,4,3,3,4,4,3,3,4,4,3,4,3,3,3,4,4,4,4,4,4,4,4,4,4,4,3,3,4,3,3,4,2,3,3]

len(number_of_centroids)

100

In [51]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.
# set i to be the iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 200
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_train_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1


In [52]:
X_train_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,...,cluster_290,cluster_291,cluster_292,cluster_293,cluster_294,cluster_295,cluster_296,cluster_297,cluster_298,cluster_299
18253,0.333333,0.313558,0.813292,0.590575,0.000000,0.0,0.239130,0.285102,0.024585,0.031560,...,3,2,1,0,2,2,1,1,2,1
29637,0.166667,0.129384,0.637874,0.374641,0.400000,0.0,0.376812,0.428725,0.028467,0.000913,...,1,2,0,2,2,2,3,0,1,1
66783,0.222222,0.175711,0.360530,0.571078,0.000000,0.0,0.391304,0.428725,0.027044,0.001042,...,1,1,0,0,2,1,0,0,1,1
58524,0.111111,0.121183,0.836911,0.743402,0.000000,0.0,0.195652,0.214362,0.037829,0.000979,...,2,0,1,1,1,0,2,1,0,2
3965,0.166667,0.107311,0.186172,0.965465,0.000000,0.0,0.188406,0.214362,0.027864,0.000418,...,2,0,2,3,1,0,2,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,0.055556,0.226226,0.623984,0.214175,0.466667,0.0,0.326087,0.356913,0.023869,0.001832,...,1,2,0,2,2,2,3,0,2,1
16805,0.111111,0.107922,0.408241,0.620267,0.000000,0.0,0.543478,0.571275,0.027293,0.000685,...,0,1,0,1,0,1,0,0,1,0
38915,0.111111,0.151719,0.933938,0.684055,0.000000,0.0,0.427536,0.428725,0.038400,0.000972,...,2,1,1,1,1,1,0,0,1,0
45130,0.111111,0.069272,0.347121,0.614691,0.000000,0.0,0.456522,0.499464,0.028099,0.000561,...,0,1,0,1,1,1,0,0,1,0


# Fourth Hundred

In [53]:
# Running this cell took 11 minutes and 55.51 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in fourth_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_train_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# 100 dataframes will take ~ 9 minutes and 10 seconds to render  & CPU % up to 675
# *** ONLY RUN IF YOU HAVE TIME! ***
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [54]:
number_of_centroids = [4,4,3,3,3,4,4,3,3,3,3,3,4,4,3,4,4,3,3,3,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,4,3,4,3,3,4,3,2,2,4,3,3,4,4,4,3,3,4,5,3,4,3,3,3,4,3,3,4,4,3,3,3,3,3,4,4,3,4,4,3,3,4,4,4,3,3,4,3,3,3,4,4,3,3,4,4,4,3,3,4,4,3,4,4,4]

len(number_of_centroids)

100

In [55]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.
# set i to be the iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 300
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_train_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1


In [56]:
X_train_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,...,cluster_390,cluster_391,cluster_392,cluster_393,cluster_394,cluster_395,cluster_396,cluster_397,cluster_398,cluster_399
18253,0.333333,0.313558,0.813292,0.590575,0.000000,0.0,0.239130,0.285102,0.024585,0.031560,...,2,3,0,0,2,0,2,3,3,1
29637,0.166667,0.129384,0.637874,0.374641,0.400000,0.0,0.376812,0.428725,0.028467,0.000913,...,3,2,1,2,3,3,0,2,2,3
66783,0.222222,0.175711,0.360530,0.571078,0.000000,0.0,0.391304,0.428725,0.027044,0.001042,...,1,0,2,0,2,3,0,2,2,3
58524,0.111111,0.121183,0.836911,0.743402,0.000000,0.0,0.195652,0.214362,0.037829,0.000979,...,2,3,0,0,2,2,2,0,0,0
3965,0.166667,0.107311,0.186172,0.965465,0.000000,0.0,0.188406,0.214362,0.027864,0.000418,...,2,3,2,0,2,2,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,0.055556,0.226226,0.623984,0.214175,0.466667,0.0,0.326087,0.356913,0.023869,0.001832,...,3,2,1,2,0,0,0,3,2,1
16805,0.111111,0.107922,0.408241,0.620267,0.000000,0.0,0.543478,0.571275,0.027293,0.000685,...,1,1,2,1,1,3,1,2,1,2
38915,0.111111,0.151719,0.933938,0.684055,0.000000,0.0,0.427536,0.428725,0.038400,0.000972,...,1,0,0,0,2,3,0,2,2,3
45130,0.111111,0.069272,0.347121,0.614691,0.000000,0.0,0.456522,0.499464,0.028099,0.000561,...,1,0,2,0,2,3,0,2,2,2


# Fifth Hundred

In [57]:
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in fifth_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_train_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# 100 dataframes will take ~ 9 minutes and 10 seconds to render  & CPU % up to 675
# *** ONLY RUN IF YOU HAVE TIME! ***
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [58]:
number_of_centroids = [4,4,4,4,4,4,4,3,4,4,3,4,4,4,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,4,4,3,3,3,3,3,4,2,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,3,2,3,4,4,3,3,4,4,3,3,4,4,4,3,3,4,4,4,4,4,3,4,3,3,4,3,4,4,4,3,3,4,4,3,4,4,3,4,3,3]

len(number_of_centroids)

100

In [59]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.
# set i to be the iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 400
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_train_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

In [None]:
X_train_scaled

# Last 60

In [60]:
# Running this cell took 11 minutes and 55.51 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in last_sixty:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_train_scaled[combo])

In [None]:

# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# 100 dataframes will take ~ 9 minutes and 10 seconds to render  & CPU % up to 675
# *** ONLY RUN IF YOU HAVE TIME! ***
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [61]:
number_of_centroids = [4,4,3,3,3,4,4,4,3,4,3,4,3,4,4,3,4,4,4,3,3,3,4,3,4,3,4,3,4,4,3,4,4,4,3,4,4,4,4,4,4,3,3,3,4,3,4,3,3,3,3,4,3,4,4,3,3,3,3,4]

len(number_of_centroids)

60

In [62]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.
# set i to be the iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 500
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_train_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1
    


In [63]:
# FINALLY FINISHED!!!  PART 2!... Part 6!
X_train_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,...,cluster_550,cluster_551,cluster_552,cluster_553,cluster_554,cluster_555,cluster_556,cluster_557,cluster_558,cluster_559
18253,0.333333,0.313558,0.813292,0.590575,0.000000,0.0,0.239130,0.285102,0.024585,0.031560,...,0,3,0,3,2,0,1,0,1,1
29637,0.166667,0.129384,0.637874,0.374641,0.400000,0.0,0.376812,0.428725,0.028467,0.000913,...,1,1,0,3,0,0,1,0,1,1
66783,0.222222,0.175711,0.360530,0.571078,0.000000,0.0,0.391304,0.428725,0.027044,0.001042,...,1,0,0,1,2,0,2,0,2,3
58524,0.111111,0.121183,0.836911,0.743402,0.000000,0.0,0.195652,0.214362,0.037829,0.000979,...,1,1,0,3,0,0,1,0,1,1
3965,0.166667,0.107311,0.186172,0.965465,0.000000,0.0,0.188406,0.214362,0.027864,0.000418,...,2,1,2,0,3,2,0,2,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,0.055556,0.226226,0.623984,0.214175,0.466667,0.0,0.326087,0.356913,0.023869,0.001832,...,0,3,0,1,2,0,0,2,1,3
16805,0.111111,0.107922,0.408241,0.620267,0.000000,0.0,0.543478,0.571275,0.027293,0.000685,...,1,0,1,0,1,1,2,1,0,0
38915,0.111111,0.151719,0.933938,0.684055,0.000000,0.0,0.427536,0.428725,0.038400,0.000972,...,1,0,2,3,0,0,2,0,2,1
45130,0.111111,0.069272,0.347121,0.614691,0.000000,0.0,0.456522,0.499464,0.028099,0.000561,...,1,0,2,0,3,2,0,2,2,3


# Now do for validate
# First Hundred

In [64]:
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in first_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_validate_scaled[combo])

In [None]:
# Running this cell took 11 minutes and 55.51 seconds. CPU % ~ 730 for this cell.
# plot delta(inertia) graphs for each group of clusters. Might want to ask Ryan if it's possible to cache 
# the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [65]:
number_of_centroids = [4,3,5,4,3,4,4,4,3,3,4,3,5,3,3,5,3,4,4,3,4,4,4,4,4,5,3,3,3,4,4,4,3,3,4,4,3,5,4,2,4,3,4,5,3,3,4,4,5,3,3,4,3,3,4,4,4,3,5,3,2,3,4,4,4,4,4,5,3,3,2,2,4,3,3,4,3,3,4,4,3,3,5,3,4,4,4,4,5,3,4,4,4,5,3,4,4,5,3,3]

len(number_of_centroids)

100

In [66]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 0
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_validate_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

# Second Hundred

In [67]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in second_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_validate_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [68]:
number_of_centroids = [5,3,5,3,3,3,3,3,4,4,4,4,3,4,4,3,5,3,4,3,3,4,4,3,3,3,3,4,5,4,3,3,5,3,4,4,3,3,3,4,3,3,4,3,3,4,4,3,3,4,3,4,4,4,4,4,4,4,5,3,3,4,4,4,4,4,5,3,4,4,4,4,4,5,3,2,4,4,4,5,3,3,2,3,3,3,4,4,5,3,3,4,3,4,3,3,3,3,3,4]

len(number_of_centroids)

100

In [69]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 100
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_validate_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

In [70]:
X_validate_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,...,cluster_190,cluster_191,cluster_192,cluster_193,cluster_194,cluster_195,cluster_196,cluster_197,cluster_198,cluster_199
68142,0.111111,0.076601,0.666255,0.549087,0.000000,0.0,0.405797,0.428725,0.027236,0.000833,...,2,0,1,0,0,1,1,1,1,2
76543,0.333333,0.206770,0.565016,0.695367,0.000000,0.0,0.492754,0.499464,0.027081,0.001299,...,2,2,2,3,1,2,1,2,2,1
54903,0.222222,0.109318,0.733715,0.479214,0.000000,0.0,0.224638,0.285102,0.031625,0.000608,...,2,0,2,0,1,0,1,1,1,2
38106,0.111111,0.111499,0.617138,0.524496,0.000000,0.0,0.268116,0.285102,0.026764,0.000849,...,0,2,1,0,0,1,1,1,1,2
18005,0.111111,0.092654,0.332643,0.565829,0.000000,0.0,0.297101,0.356913,0.026401,0.002119,...,0,0,2,0,2,0,1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6182,0.055556,0.063078,0.552977,0.145526,0.266667,0.0,0.304348,0.356913,0.025961,0.000216,...,2,2,0,3,1,0,2,1,1,2
40343,0.000000,0.039958,0.481820,0.631453,0.000000,0.0,0.065217,0.070740,0.027811,0.005877,...,0,0,1,0,0,1,1,2,1,2
74104,0.222222,0.154947,0.452482,0.631855,0.000000,0.0,0.768116,0.785638,0.027346,0.000912,...,1,2,1,0,0,1,1,2,2,1
30559,0.000000,0.084802,0.347794,0.684135,0.000000,0.0,0.442029,0.499464,0.036411,0.000698,...,1,1,0,1,2,0,1,2,2,1


# Third Hundred

In [71]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in third_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_validate_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [72]:
number_of_centroids = [4,3,3,3,3,3,5,4,2,3,4,3,4,4,4,4,3,4,3,4,4,3,3,4,4,4,4,5,3,3,4,3,2,3,3,3,4,3,3,3,3,3,3,3,4,3,4,3,4,4,4,5,3,3,3,3,4,4,3,3,3,4,4,3,3,3,4,3,3,5,3,5,3,3,3,4,4,4,4,3,3,4,3,4,4,3,3,4,3,3,3,4,4,4,4,4,4,3,3,3]

len(number_of_centroids)

100

In [73]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 200
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_validate_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

In [74]:
X_validate_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,...,cluster_290,cluster_291,cluster_292,cluster_293,cluster_294,cluster_295,cluster_296,cluster_297,cluster_298,cluster_299
68142,0.111111,0.076601,0.666255,0.549087,0.000000,0.0,0.405797,0.428725,0.027236,0.000833,...,0,3,3,1,0,3,3,0,2,2
76543,0.333333,0.206770,0.565016,0.695367,0.000000,0.0,0.492754,0.499464,0.027081,0.001299,...,1,2,3,2,1,3,3,0,2,0
54903,0.222222,0.109318,0.733715,0.479214,0.000000,0.0,0.224638,0.285102,0.031625,0.000608,...,0,3,3,1,1,0,1,2,0,1
38106,0.111111,0.111499,0.617138,0.524496,0.000000,0.0,0.268116,0.285102,0.026764,0.000849,...,2,3,0,1,0,0,1,2,0,1
18005,0.111111,0.092654,0.332643,0.565829,0.000000,0.0,0.297101,0.356913,0.026401,0.002119,...,0,3,0,1,1,0,1,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6182,0.055556,0.063078,0.552977,0.145526,0.266667,0.0,0.304348,0.356913,0.025961,0.000216,...,0,1,2,3,2,0,2,2,0,1
40343,0.000000,0.039958,0.481820,0.631453,0.000000,0.0,0.065217,0.070740,0.027811,0.005877,...,0,3,0,2,0,2,1,1,0,2
74104,0.222222,0.154947,0.452482,0.631855,0.000000,0.0,0.768116,0.785638,0.027346,0.000912,...,2,0,3,2,0,1,3,0,2,2
30559,0.000000,0.084802,0.347794,0.684135,0.000000,0.0,0.442029,0.499464,0.036411,0.000698,...,0,0,0,2,1,3,3,0,2,2


# Fourth Hundred

In [75]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in fourth_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_validate_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [76]:
number_of_centroids = [3,3,4,4,4,4,4,3,3,4,3,4,4,4,4,4,4,4,4,3,3,3,3,4,4,3,3,3,4,4,3,3,5,4,3,5,4,4,4,3,3,4,3,2,2,3,3,2,4,3,4,3,3,4,4,3,3,5,3,3,5,4,4,5,5,5,3,3,3,3,3,4,4,3,3,3,4,4,4,3,3,4,3,4,3,3,3,4,3,3,4,3,3,3,3,4,3,3,4,4]

len(number_of_centroids)

100

In [77]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 300
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_validate_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

# Fifth Hundred

In [78]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in fifth_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_validate_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [79]:
number_of_centroids = [4,4,5,3,4,4,4,4,4,4,4,3,3,3,3,3,3,5,3,4,4,4,4,5,3,4,4,3,5,3,3,4,4,3,4,5,3,5,3,3,3,3,3,3,3,3,5,3,4,4,4,4,4,5,3,3,4,2,3,4,3,2,3,3,5,3,3,2,5,3,3,5,3,5,3,2,3,3,3,3,3,5,3,5,3,3,4,5,3,3,3,5,5,3,3,3,5,3,3,5]

len(number_of_centroids)

100

In [80]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 400
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_validate_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

# Last 60

In [81]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in last_sixty:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_validate_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [82]:
number_of_centroids = [3,5,3,3,4,4,4,4,5,3,4,4,4,5,3,4,4,5,3,4,5,3,5,3,3,3,3,3,4,3,3,3,5,3,4,4,3,5,3,3,4,4,5,3,4,5,3,5,3,3,3,5,3,5,3,3,5,3,3,3]

len(number_of_centroids)

60

In [83]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 500
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_validate_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

# Now do for test
# First Hundred

In [179]:
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in first_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_test_scaled[combo])

In [None]:
# Running this cell took 11 minutes and 55.51 seconds. CPU % ~ 730 for this cell.
# plot delta(inertia) graphs for each group of clusters. Might want to ask Ryan if it's possible to cache 
# the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [180]:
number_of_centroids = [4,3,5,4,3,4,4,4,3,3,4,3,5,3,3,5,3,4,4,3,4,4,4,4,4,5,3,3,3,4,4,4,3,3,4,4,3,5,4,2,4,3,4,5,3,3,4,4,5,3,3,4,3,3,4,4,4,3,5,3,2,3,4,4,4,4,4,5,3,3,2,2,4,3,3,4,3,3,4,4,3,3,5,3,4,4,4,4,5,3,4,4,4,5,3,4,4,5,3,3]

len(number_of_centroids)

100

In [181]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 0
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_test_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

# Second Hundred

In [182]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in second_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_test_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [183]:
number_of_centroids = [5,3,5,3,3,3,3,3,4,4,4,4,3,4,4,3,5,3,4,3,3,4,4,3,3,3,3,4,5,4,3,3,5,3,4,4,3,3,3,4,3,3,4,3,3,4,4,3,3,4,3,4,4,4,4,4,4,4,5,3,3,4,4,4,4,4,5,3,4,4,4,4,4,5,3,2,4,4,4,5,3,3,2,3,3,3,4,4,5,3,3,4,3,4,3,3,3,3,3,4]

len(number_of_centroids)

100

In [184]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 100
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_test_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

In [70]:
X_validate_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,...,cluster_190,cluster_191,cluster_192,cluster_193,cluster_194,cluster_195,cluster_196,cluster_197,cluster_198,cluster_199
68142,0.111111,0.076601,0.666255,0.549087,0.000000,0.0,0.405797,0.428725,0.027236,0.000833,...,2,0,1,0,0,1,1,1,1,2
76543,0.333333,0.206770,0.565016,0.695367,0.000000,0.0,0.492754,0.499464,0.027081,0.001299,...,2,2,2,3,1,2,1,2,2,1
54903,0.222222,0.109318,0.733715,0.479214,0.000000,0.0,0.224638,0.285102,0.031625,0.000608,...,2,0,2,0,1,0,1,1,1,2
38106,0.111111,0.111499,0.617138,0.524496,0.000000,0.0,0.268116,0.285102,0.026764,0.000849,...,0,2,1,0,0,1,1,1,1,2
18005,0.111111,0.092654,0.332643,0.565829,0.000000,0.0,0.297101,0.356913,0.026401,0.002119,...,0,0,2,0,2,0,1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6182,0.055556,0.063078,0.552977,0.145526,0.266667,0.0,0.304348,0.356913,0.025961,0.000216,...,2,2,0,3,1,0,2,1,1,2
40343,0.000000,0.039958,0.481820,0.631453,0.000000,0.0,0.065217,0.070740,0.027811,0.005877,...,0,0,1,0,0,1,1,2,1,2
74104,0.222222,0.154947,0.452482,0.631855,0.000000,0.0,0.768116,0.785638,0.027346,0.000912,...,1,2,1,0,0,1,1,2,2,1
30559,0.000000,0.084802,0.347794,0.684135,0.000000,0.0,0.442029,0.499464,0.036411,0.000698,...,1,1,0,1,2,0,1,2,2,1


# Third Hundred

In [185]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in third_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_test_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [186]:
number_of_centroids = [4,3,3,3,3,3,5,4,2,3,4,3,4,4,4,4,3,4,3,4,4,3,3,4,4,4,4,5,3,3,4,3,2,3,3,3,4,3,3,3,3,3,3,3,4,3,4,3,4,4,4,5,3,3,3,3,4,4,3,3,3,4,4,3,3,3,4,3,3,5,3,5,3,3,3,4,4,4,4,3,3,4,3,4,4,3,3,4,3,3,3,4,4,4,4,4,4,3,3,3]

len(number_of_centroids)

100

In [187]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 200
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_test_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

In [74]:
X_validate_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,...,cluster_290,cluster_291,cluster_292,cluster_293,cluster_294,cluster_295,cluster_296,cluster_297,cluster_298,cluster_299
68142,0.111111,0.076601,0.666255,0.549087,0.000000,0.0,0.405797,0.428725,0.027236,0.000833,...,0,3,3,1,0,3,3,0,2,2
76543,0.333333,0.206770,0.565016,0.695367,0.000000,0.0,0.492754,0.499464,0.027081,0.001299,...,1,2,3,2,1,3,3,0,2,0
54903,0.222222,0.109318,0.733715,0.479214,0.000000,0.0,0.224638,0.285102,0.031625,0.000608,...,0,3,3,1,1,0,1,2,0,1
38106,0.111111,0.111499,0.617138,0.524496,0.000000,0.0,0.268116,0.285102,0.026764,0.000849,...,2,3,0,1,0,0,1,2,0,1
18005,0.111111,0.092654,0.332643,0.565829,0.000000,0.0,0.297101,0.356913,0.026401,0.002119,...,0,3,0,1,1,0,1,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6182,0.055556,0.063078,0.552977,0.145526,0.266667,0.0,0.304348,0.356913,0.025961,0.000216,...,0,1,2,3,2,0,2,2,0,1
40343,0.000000,0.039958,0.481820,0.631453,0.000000,0.0,0.065217,0.070740,0.027811,0.005877,...,0,3,0,2,0,2,1,1,0,2
74104,0.222222,0.154947,0.452482,0.631855,0.000000,0.0,0.768116,0.785638,0.027346,0.000912,...,2,0,3,2,0,1,3,0,2,2
30559,0.000000,0.084802,0.347794,0.684135,0.000000,0.0,0.442029,0.499464,0.036411,0.000698,...,0,0,0,2,1,3,3,0,2,2


# Fourth Hundred

In [193]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in fourth_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_validate_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [194]:
number_of_centroids = [3,3,4,4,4,4,4,3,3,4,3,4,4,4,4,4,4,4,4,3,3,3,3,4,4,3,3,3,4,4,3,3,5,4,3,5,4,4,4,3,3,4,3,2,2,3,3,2,4,3,4,3,3,4,4,3,3,5,3,3,5,4,4,5,5,5,3,3,3,3,3,4,4,3,3,3,4,4,4,3,3,4,3,4,3,3,3,4,3,3,4,3,3,3,3,4,3,3,4,4]

len(number_of_centroids)

100

In [192]:
X_test_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,...,cluster_290,cluster_291,cluster_292,cluster_293,cluster_294,cluster_295,cluster_296,cluster_297,cluster_298,cluster_299
42377,0.222222,0.139155,0.332369,0.620181,0.000000,0.0,0.086957,0.142551,0.028926,0.004138,...,2,3,0,3,3,0,0,2,0,0
5602,0.000000,0.085238,0.338989,0.713830,0.000000,0.0,0.463768,0.499464,0.029111,0.000723,...,0,3,3,0,3,1,1,1,1,1
43594,0.111111,0.158262,0.622026,0.481283,0.000000,0.0,0.420290,0.428725,0.026664,0.002582,...,2,2,3,3,1,1,1,2,0,2
32058,0.222222,0.081225,0.556751,0.529892,0.000000,0.0,0.318841,0.356913,0.026329,0.003062,...,0,2,0,3,1,2,0,2,0,1
18021,0.111111,0.111324,0.559160,0.528242,0.000000,0.0,0.543478,0.571275,0.026488,0.000827,...,2,3,3,3,1,3,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9033,0.000000,0.062293,0.518727,0.807608,0.000000,0.0,0.449275,0.499464,0.028723,0.001049,...,0,3,1,2,2,1,3,1,2,1
64797,0.222222,0.118566,0.547002,0.663218,0.000000,0.0,0.681159,0.713826,0.026371,0.000919,...,1,2,3,0,1,3,1,1,1,1
23162,0.222222,0.103036,0.423132,0.547123,0.000000,0.0,0.478261,0.499464,0.026551,0.000864,...,0,2,3,3,1,1,1,1,1,1
15287,0.222222,0.174926,0.307641,0.806774,0.533333,0.0,0.442029,0.499464,0.029312,0.001007,...,1,3,1,2,2,1,3,1,2,0


In [195]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 300
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_test_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

ValueError: Length of values (16450) does not match length of index (13684)

# Fifth Hundred

In [78]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in fifth_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_test_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [79]:
number_of_centroids = [4,4,5,3,4,4,4,4,4,4,4,3,3,3,3,3,3,5,3,4,4,4,4,5,3,4,4,3,5,3,3,4,4,3,4,5,3,5,3,3,3,3,3,3,3,3,5,3,4,4,4,4,4,5,3,3,4,2,3,4,3,2,3,3,5,3,3,2,5,3,3,5,3,5,3,2,3,3,3,3,3,5,3,5,3,3,4,5,3,3,3,5,5,3,3,3,5,3,3,5]

len(number_of_centroids)

100

In [80]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 400
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_test_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

# Last 60

In [81]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in last_sixty:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_test_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [82]:
number_of_centroids = [3,5,3,3,4,4,4,4,5,3,4,4,4,5,3,4,4,5,3,4,5,3,5,3,3,3,3,3,4,3,3,3,5,3,4,4,3,5,3,3,4,4,5,3,4,5,3,5,3,3,3,5,3,5,3,3,5,3,3,3]

len(number_of_centroids)

60

In [83]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 500
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_test_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

# Now do for validate
# First Hundred

In [64]:
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in first_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_validate_scaled[combo])

In [None]:
# Running this cell took 11 minutes and 55.51 seconds. CPU % ~ 730 for this cell.
# plot delta(inertia) graphs for each group of clusters. Might want to ask Ryan if it's possible to cache 
# the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [65]:
number_of_centroids = [4,3,5,4,3,4,4,4,3,3,4,3,5,3,3,5,3,4,4,3,4,4,4,4,4,5,3,3,3,4,4,4,3,3,4,4,3,5,4,2,4,3,4,5,3,3,4,4,5,3,3,4,3,3,4,4,4,3,5,3,2,3,4,4,4,4,4,5,3,3,2,2,4,3,3,4,3,3,4,4,3,3,5,3,4,4,4,4,5,3,4,4,4,5,3,4,4,5,3,3]

len(number_of_centroids)

100

In [66]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 0
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_validate_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

# Second Hundred

In [67]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in second_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_validate_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [68]:
number_of_centroids = [5,3,5,3,3,3,3,3,4,4,4,4,3,4,4,3,5,3,4,3,3,4,4,3,3,3,3,4,5,4,3,3,5,3,4,4,3,3,3,4,3,3,4,3,3,4,4,3,3,4,3,4,4,4,4,4,4,4,5,3,3,4,4,4,4,4,5,3,4,4,4,4,4,5,3,2,4,4,4,5,3,3,2,3,3,3,4,4,5,3,3,4,3,4,3,3,3,3,3,4]

len(number_of_centroids)

100

In [69]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 100
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_validate_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

In [70]:
X_validate_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,...,cluster_190,cluster_191,cluster_192,cluster_193,cluster_194,cluster_195,cluster_196,cluster_197,cluster_198,cluster_199
68142,0.111111,0.076601,0.666255,0.549087,0.000000,0.0,0.405797,0.428725,0.027236,0.000833,...,2,0,1,0,0,1,1,1,1,2
76543,0.333333,0.206770,0.565016,0.695367,0.000000,0.0,0.492754,0.499464,0.027081,0.001299,...,2,2,2,3,1,2,1,2,2,1
54903,0.222222,0.109318,0.733715,0.479214,0.000000,0.0,0.224638,0.285102,0.031625,0.000608,...,2,0,2,0,1,0,1,1,1,2
38106,0.111111,0.111499,0.617138,0.524496,0.000000,0.0,0.268116,0.285102,0.026764,0.000849,...,0,2,1,0,0,1,1,1,1,2
18005,0.111111,0.092654,0.332643,0.565829,0.000000,0.0,0.297101,0.356913,0.026401,0.002119,...,0,0,2,0,2,0,1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6182,0.055556,0.063078,0.552977,0.145526,0.266667,0.0,0.304348,0.356913,0.025961,0.000216,...,2,2,0,3,1,0,2,1,1,2
40343,0.000000,0.039958,0.481820,0.631453,0.000000,0.0,0.065217,0.070740,0.027811,0.005877,...,0,0,1,0,0,1,1,2,1,2
74104,0.222222,0.154947,0.452482,0.631855,0.000000,0.0,0.768116,0.785638,0.027346,0.000912,...,1,2,1,0,0,1,1,2,2,1
30559,0.000000,0.084802,0.347794,0.684135,0.000000,0.0,0.442029,0.499464,0.036411,0.000698,...,1,1,0,1,2,0,1,2,2,1


# Third Hundred

In [71]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in third_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_validate_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [72]:
number_of_centroids = [4,3,3,3,3,3,5,4,2,3,4,3,4,4,4,4,3,4,3,4,4,3,3,4,4,4,4,5,3,3,4,3,2,3,3,3,4,3,3,3,3,3,3,3,4,3,4,3,4,4,4,5,3,3,3,3,4,4,3,3,3,4,4,3,3,3,4,3,3,5,3,5,3,3,3,4,4,4,4,3,3,4,3,4,4,3,3,4,3,3,3,4,4,4,4,4,4,3,3,3]

len(number_of_centroids)

100

In [73]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 200
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_validate_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

In [74]:
X_validate_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,...,cluster_290,cluster_291,cluster_292,cluster_293,cluster_294,cluster_295,cluster_296,cluster_297,cluster_298,cluster_299
68142,0.111111,0.076601,0.666255,0.549087,0.000000,0.0,0.405797,0.428725,0.027236,0.000833,...,0,3,3,1,0,3,3,0,2,2
76543,0.333333,0.206770,0.565016,0.695367,0.000000,0.0,0.492754,0.499464,0.027081,0.001299,...,1,2,3,2,1,3,3,0,2,0
54903,0.222222,0.109318,0.733715,0.479214,0.000000,0.0,0.224638,0.285102,0.031625,0.000608,...,0,3,3,1,1,0,1,2,0,1
38106,0.111111,0.111499,0.617138,0.524496,0.000000,0.0,0.268116,0.285102,0.026764,0.000849,...,2,3,0,1,0,0,1,2,0,1
18005,0.111111,0.092654,0.332643,0.565829,0.000000,0.0,0.297101,0.356913,0.026401,0.002119,...,0,3,0,1,1,0,1,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6182,0.055556,0.063078,0.552977,0.145526,0.266667,0.0,0.304348,0.356913,0.025961,0.000216,...,0,1,2,3,2,0,2,2,0,1
40343,0.000000,0.039958,0.481820,0.631453,0.000000,0.0,0.065217,0.070740,0.027811,0.005877,...,0,3,0,2,0,2,1,1,0,2
74104,0.222222,0.154947,0.452482,0.631855,0.000000,0.0,0.768116,0.785638,0.027346,0.000912,...,2,0,3,2,0,1,3,0,2,2
30559,0.000000,0.084802,0.347794,0.684135,0.000000,0.0,0.442029,0.499464,0.036411,0.000698,...,0,0,0,2,1,3,3,0,2,2


# Fourth Hundred

In [75]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in fourth_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_validate_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [76]:
number_of_centroids = [3,3,4,4,4,4,4,3,3,4,3,4,4,4,4,4,4,4,4,3,3,3,3,4,4,3,3,3,4,4,3,3,5,4,3,5,4,4,4,3,3,4,3,2,2,3,3,2,4,3,4,3,3,4,4,3,3,5,3,3,5,4,4,5,5,5,3,3,3,3,3,4,4,3,3,3,4,4,4,3,3,4,3,4,3,3,3,4,3,3,4,3,3,3,3,4,3,3,4,4]

len(number_of_centroids)

100

In [77]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 300
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_validate_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

# Fifth Hundred

In [78]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in fifth_hundred:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_validate_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [79]:
number_of_centroids = [4,4,5,3,4,4,4,4,4,4,4,3,3,3,3,3,3,5,3,4,4,4,4,5,3,4,4,3,5,3,3,4,4,3,4,5,3,5,3,3,3,3,3,3,3,3,5,3,4,4,4,4,4,5,3,3,4,2,3,4,3,2,3,3,5,3,3,2,5,3,3,5,3,5,3,2,3,3,3,3,3,5,3,5,3,3,4,5,3,3,3,5,5,3,3,3,5,3,3,5]

len(number_of_centroids)

100

In [80]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 400
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_validate_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

# Last 60

In [81]:
# Running this cell took 10 minutes and 03.95 seconds. CPU % ~ 730 for this cell.
# Create a list of the 100 features.
list_of_hundred_features = []
for combo in last_sixty:
    list_of_hundred_features.append(list(combo))
# Convert into dataframes.
list_of_dataframes = []
for combo in list_of_hundred_features:
    list_of_dataframes.append(X_validate_scaled[combo])

In [None]:
# plot delta(inertia) graphs for each group of clusters. Takes ~ 42.37 seconds to run on 10 clusters. Might want to ask Ryan if it's possible to cache the information that is returned.
# I'm guessing 100 dataframes will take ten times as long? Let's see: 9 minutes and 10 seconds  & CPU % up to 675
for combo in list_of_dataframes:
    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(9, 6))
        pd.Series({k: KMeans(k).fit(combo).inertia_ for k in range(2, 12)}).plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        plt.title('Change in inertia as k increases')

In [82]:
number_of_centroids = [3,5,3,3,4,4,4,4,5,3,4,4,4,5,3,4,4,5,3,4,5,3,5,3,3,3,3,3,4,3,3,3,5,3,4,4,3,5,3,3,4,4,5,3,4,5,3,5,3,3,3,5,3,5,3,3,5,3,3,3]

len(number_of_centroids)

60

In [83]:
# This cell will iterate through the list of dataframes and fit the kmeans with the specified number of centroids extracted from the [{inertia vs k}] graphs above.

# This cell takes ~ 30 seconds  to render with 100 clusters.

# set i to be thea iteration count for number_of_centroids index
# set n to be the iteration count for list_of_dataframes index
i = 0
k = 0
s = 500
list_of_cluster_predictions = []
for combo in list_of_dataframes: 
    kmeans = KMeans(n_clusters = number_of_centroids[k])
    kmeans.fit(list_of_dataframes[i])
    X_validate_scaled['cluster_' + str(s)] = kmeans.predict(combo)
    list_of_cluster_predictions.append(kmeans.predict(list_of_dataframes[i]))
    i += 1
    k += 1
    s += 1

In [84]:
X_train_scaled, X_validate_scaled

(       bathroomcnt  calculatedfinishedsquarefeet  latitude  longitude  \
 18253     0.333333                      0.313558  0.813292   0.590575   
 29637     0.166667                      0.129384  0.637874   0.374641   
 66783     0.222222                      0.175711  0.360530   0.571078   
 58524     0.111111                      0.121183  0.836911   0.743402   
 3965      0.166667                      0.107311  0.186172   0.965465   
 ...            ...                           ...       ...        ...   
 29239     0.055556                      0.226226  0.623984   0.214175   
 16805     0.111111                      0.107922  0.408241   0.620267   
 38915     0.111111                      0.151719  0.933938   0.684055   
 45130     0.111111                      0.069272  0.347121   0.614691   
 5668      0.222222                      0.117257  0.641252   0.449278   
 
         roomcnt  assessmentyear       age   age_bin   taxrate     acres  ...  \
 18253  0.000000             

In [86]:
y_train

Unnamed: 0,logerror,logerror_pred_mean,logerror_pred_median,logerror_pred_lm,logerror_pred_lars
18253,-0.241830,0.017837,0.007131,0.032319,0.017837
29637,0.116972,0.017837,0.007131,0.015009,0.017837
66783,-0.021770,0.017837,0.007131,0.031397,0.017837
58524,-0.036804,0.017837,0.007131,0.015402,0.017837
3965,-0.022598,0.017837,0.007131,0.010482,0.017837
...,...,...,...,...,...
29239,-0.000416,0.017837,0.007131,0.022704,0.017837
16805,-0.012333,0.017837,0.007131,0.019006,0.017837
38915,-0.065222,0.017837,0.007131,0.024489,0.017837
45130,0.070491,0.017837,0.007131,0.016449,0.017837


# Time to fit the model on my data to get the top performing features!

In [87]:
## Running this cell with this much data processing takes about 3 minutes.
# Initialize the linear regression object!
lm = LinearRegression()
# Initialize the RFE object, setting the hyperparameters to be our linear regression object created above
# (as the algorithm to test the features on) and the number of features to return to be 5.
rfe = RFE(lm, n_features_to_select=100)
# Fit the RFE object to our data. This means create multiple linear regression models, find the one that 
# performs best, and identify the features that are used in that model. Those are the features we want.
# Transform our X dataframe to include only those 2 features. .transform() or do both of those steps together 
# with .fit_transform()
# Transforming data using RFE.
X_rfe = rfe.fit_transform(X_train_scaled,y_train)  
#Fitting the model onto new clusters.
lm.fit(X_rfe,y_train)

# Create boolean mask.
mask = rfe.support_

# Store top 100 features into this variable.
rfe_features = X_train_scaled.loc[:, mask].columns.tolist()

In [88]:
rfe_features

['calculatedfinishedsquarefeet',
 'assessmentyear',
 'taxrate',
 'acres',
 'bath_bed_ratio',
 'cola',
 'cluster_58',
 'cluster_59',
 'cluster_82',
 'cluster_83',
 'cluster_89',
 'cluster_94',
 'cluster_103',
 'cluster_130',
 'cluster_136',
 'cluster_146',
 'cluster_170',
 'cluster_176',
 'cluster_185',
 'cluster_188',
 'cluster_192',
 'cluster_277',
 'cluster_278',
 'cluster_342',
 'cluster_343',
 'cluster_345',
 'cluster_346',
 'cluster_348',
 'cluster_349',
 'cluster_358',
 'cluster_366',
 'cluster_367',
 'cluster_371',
 'cluster_372',
 'cluster_373',
 'cluster_375',
 'cluster_379',
 'cluster_384',
 'cluster_387',
 'cluster_391',
 'cluster_392',
 'cluster_394',
 'cluster_395',
 'cluster_404',
 'cluster_405',
 'cluster_409',
 'cluster_413',
 'cluster_414',
 'cluster_416',
 'cluster_419',
 'cluster_420',
 'cluster_422',
 'cluster_424',
 'cluster_426',
 'cluster_428',
 'cluster_429',
 'cluster_433',
 'cluster_435',
 'cluster_436',
 'cluster_438',
 'cluster_440',
 'cluster_441',
 'cluste

In [89]:
y_train

Unnamed: 0,logerror,logerror_pred_mean,logerror_pred_median,logerror_pred_lm,logerror_pred_lars
18253,-0.241830,0.017837,0.007131,0.032319,0.017837
29637,0.116972,0.017837,0.007131,0.015009,0.017837
66783,-0.021770,0.017837,0.007131,0.031397,0.017837
58524,-0.036804,0.017837,0.007131,0.015402,0.017837
3965,-0.022598,0.017837,0.007131,0.010482,0.017837
...,...,...,...,...,...
29239,-0.000416,0.017837,0.007131,0.022704,0.017837
16805,-0.012333,0.017837,0.007131,0.019006,0.017837
38915,-0.065222,0.017837,0.007131,0.024489,0.017837
45130,0.070491,0.017837,0.007131,0.016449,0.017837


In [92]:
# Check out the top ten performing features.
f_selector = SelectKBest(f_regression, k=10)

f_selector.fit(X_train_scaled, y_train.logerror)

X_reduced = f_selector.transform(X_train_scaled)

f_support = f_selector.get_support()

f_feature = X_train_scaled.loc[:,f_support].columns.tolist()

# you could also get the list this way (among many others)
# f_feature = [X_train_scaled.columns.values[i] for i in range(len(feature_mask)) if feature_mask[i]==True]

print(str(len(f_feature)), 'selected features')
print(f_feature)

(38285, 576)
(38285, 10)
(38285, 2)
10 selected features
['calculatedfinishedsquarefeet', 'cluster_95', 'cluster_123', 'cluster_186', 'cluster_317', 'cluster_408', 'cluster_451', 'cluster_457', 'cluster_499', 'cluster_516']


In [169]:
# Choose 5 of the best clusters. If this is over fit use less.
best_feats = f_feature
best_feats

['calculatedfinishedsquarefeet',
 'cluster_95',
 'cluster_123',
 'cluster_186',
 'cluster_317',
 'cluster_408',
 'cluster_451',
 'cluster_457',
 'cluster_499',
 'cluster_516']

In [164]:
train_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,logerror,age,age_bin,taxrate,acres,acres_bin,sqft_bin,structure_dollar_sqft_bin,lot_dollar_sqft_bin,bath_bed_ratio,cola,bathroomcnt.1,calculatedfinishedsquarefeet.1,latitude.1,longitude.1,roomcnt.1,assessmentyear.1,age.1,age_bin.1,taxrate.1,acres.1,acres_bin.1,sqft_bin.1,structure_dollar_sqft_bin.1,lot_dollar_sqft_bin.1,bath_bed_ratio.1,cola.1,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17,cluster_18,cluster_19,cluster_20,cluster_21,cluster_22,cluster_23,cluster_24,cluster_25,cluster_26,cluster_27,cluster_28,cluster_29,cluster_30,cluster_31,cluster_32,cluster_33,cluster_34,cluster_35,cluster_36,cluster_37,cluster_38,cluster_39,cluster_40,cluster_41,cluster_42,cluster_43,cluster_44,cluster_45,cluster_46,cluster_47,cluster_48,cluster_49,cluster_50,cluster_51,cluster_52,cluster_53,cluster_54,cluster_55,cluster_56,cluster_57,cluster_58,cluster_59,cluster_60,cluster_61,cluster_62,cluster_63,cluster_64,cluster_65,cluster_66,cluster_67,cluster_68,cluster_69,cluster_70,cluster_71,cluster_72,cluster_73,cluster_74,cluster_75,cluster_76,cluster_77,cluster_78,cluster_79,cluster_80,cluster_81,cluster_82,cluster_83,cluster_84,cluster_85,cluster_86,cluster_87,cluster_88,cluster_89,cluster_90,cluster_91,cluster_92,cluster_93,cluster_94,cluster_95,cluster_96,cluster_97,cluster_98,cluster_99,cluster_100,cluster_101,cluster_102,cluster_103,cluster_104,cluster_105,cluster_106,cluster_107,cluster_108,cluster_109,cluster_110,cluster_111,cluster_112,cluster_113,cluster_114,cluster_115,cluster_116,cluster_117,cluster_118,cluster_119,cluster_120,cluster_121,cluster_122,cluster_123,cluster_124,cluster_125,cluster_126,cluster_127,cluster_128,cluster_129,cluster_130,cluster_131,cluster_132,cluster_133,cluster_134,cluster_135,cluster_136,cluster_137,cluster_138,cluster_139,cluster_140,cluster_141,cluster_142,cluster_143,cluster_144,cluster_145,cluster_146,cluster_147,cluster_148,cluster_149,cluster_150,cluster_151,cluster_152,cluster_153,cluster_154,cluster_155,cluster_156,cluster_157,cluster_158,cluster_159,cluster_160,cluster_161,cluster_162,cluster_163,cluster_164,cluster_165,cluster_166,cluster_167,cluster_168,cluster_169,cluster_170,cluster_171,cluster_172,cluster_173,cluster_174,cluster_175,cluster_176,cluster_177,cluster_178,cluster_179,cluster_180,cluster_181,cluster_182,cluster_183,cluster_184,cluster_185,cluster_186,cluster_187,cluster_188,cluster_189,cluster_190,cluster_191,cluster_192,cluster_193,cluster_194,cluster_195,cluster_196,cluster_197,cluster_198,cluster_199,cluster_200,cluster_201,cluster_202,cluster_203,cluster_204,cluster_205,cluster_206,cluster_207,cluster_208,cluster_209,cluster_210,cluster_211,cluster_212,cluster_213,cluster_214,cluster_215,cluster_216,cluster_217,cluster_218,cluster_219,cluster_220,cluster_221,cluster_222,cluster_223,cluster_224,cluster_225,cluster_226,cluster_227,cluster_228,cluster_229,cluster_230,cluster_231,cluster_232,cluster_233,cluster_234,cluster_235,cluster_236,cluster_237,cluster_238,cluster_239,cluster_240,cluster_241,cluster_242,cluster_243,cluster_244,cluster_245,cluster_246,cluster_247,cluster_248,cluster_249,cluster_250,cluster_251,cluster_252,cluster_253,cluster_254,cluster_255,cluster_256,cluster_257,cluster_258,cluster_259,cluster_260,cluster_261,cluster_262,cluster_263,cluster_264,cluster_265,cluster_266,cluster_267,cluster_268,cluster_269,cluster_270,cluster_271,cluster_272,cluster_273,cluster_274,cluster_275,cluster_276,cluster_277,cluster_278,cluster_279,cluster_280,cluster_281,cluster_282,cluster_283,cluster_284,cluster_285,cluster_286,cluster_287,cluster_288,cluster_289,cluster_290,cluster_291,cluster_292,cluster_293,cluster_294,cluster_295,cluster_296,cluster_297,cluster_298,cluster_299,cluster_300,cluster_301,cluster_302,cluster_303,cluster_304,cluster_305,cluster_306,cluster_307,cluster_308,cluster_309,cluster_310,cluster_311,cluster_312,cluster_313,cluster_314,cluster_315,cluster_316,cluster_317,cluster_318,cluster_319,cluster_320,cluster_321,cluster_322,cluster_323,cluster_324,cluster_325,cluster_326,cluster_327,cluster_328,cluster_329,cluster_330,cluster_331,cluster_332,cluster_333,cluster_334,cluster_335,cluster_336,cluster_337,cluster_338,cluster_339,cluster_340,cluster_341,cluster_342,cluster_343,cluster_344,cluster_345,cluster_346,cluster_347,cluster_348,cluster_349,cluster_350,cluster_351,cluster_352,cluster_353,cluster_354,cluster_355,cluster_356,cluster_357,cluster_358,cluster_359,cluster_360,cluster_361,cluster_362,cluster_363,cluster_364,cluster_365,cluster_366,cluster_367,cluster_368,cluster_369,cluster_370,cluster_371,cluster_372,cluster_373,cluster_374,cluster_375,cluster_376,cluster_377,cluster_378,cluster_379,cluster_380,cluster_381,cluster_382,cluster_383,cluster_384,cluster_385,cluster_386,cluster_387,cluster_388,cluster_389,cluster_390,cluster_391,cluster_392,cluster_393,cluster_394,cluster_395,cluster_396,cluster_397,cluster_398,cluster_399,cluster_400,cluster_401,cluster_402,cluster_403,cluster_404,cluster_405,cluster_406,cluster_407,cluster_408,cluster_409,cluster_410,cluster_411,cluster_412,cluster_413,cluster_414,cluster_415,cluster_416,cluster_417,cluster_418,cluster_419,cluster_420,cluster_421,cluster_422,cluster_423,cluster_424,cluster_425,cluster_426,cluster_427,cluster_428,cluster_429,cluster_430,cluster_431,cluster_432,cluster_433,cluster_434,cluster_435,cluster_436,cluster_437,cluster_438,cluster_439,cluster_440,cluster_441,cluster_442,cluster_443,cluster_444,cluster_445,cluster_446,cluster_447,cluster_448,cluster_449,cluster_450,cluster_451,cluster_452,cluster_453,cluster_454,cluster_455,cluster_456,cluster_457,cluster_458,cluster_459,cluster_460,cluster_461,cluster_462,cluster_463,cluster_464,cluster_465,cluster_466,cluster_467,cluster_468,cluster_469,cluster_470,cluster_471,cluster_472,cluster_473,cluster_474,cluster_475,cluster_476,cluster_477,cluster_478,cluster_479,cluster_480,cluster_481,cluster_482,cluster_483,cluster_484,cluster_485,cluster_486,cluster_487,cluster_488,cluster_489,cluster_490,cluster_491,cluster_492,cluster_493,cluster_494,cluster_495,cluster_496,cluster_497,cluster_498,cluster_499,cluster_500,cluster_501,cluster_502,cluster_503,cluster_504,cluster_505,cluster_506,cluster_507,cluster_508,cluster_509,cluster_510,cluster_511,cluster_512,cluster_513,cluster_514,cluster_515,cluster_516,cluster_517,cluster_518,cluster_519,cluster_520,cluster_521,cluster_522,cluster_523,cluster_524,cluster_525,cluster_526,cluster_527,cluster_528,cluster_529,cluster_530,cluster_531,cluster_532,cluster_533,cluster_534,cluster_535,cluster_536,cluster_537,cluster_538,cluster_539,cluster_540,cluster_541,cluster_542,cluster_543,cluster_544,cluster_545,cluster_546,cluster_547,cluster_548,cluster_549,cluster_550,cluster_551,cluster_552,cluster_553,cluster_554,cluster_555,cluster_556,cluster_557,cluster_558,cluster_559
2,0.111111,0.087332,0.375096,0.860526,0.400000,0.0,0.458924,0.391304,0.428725,0.025165,0.001176,0.222222,0.222222,0.222222,0.444444,0.097222,0.0,0.111111,0.087332,0.375096,0.860526,0.400000,0.0,0.391304,0.428725,0.025165,0.001176,0.222222,0.222222,0.222222,0.444444,0.097222,0.0,4.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,3.0,3.0,0.0,1.0,0.0,2.0,2.0,3.0,0.0,2.0,0.0,1.0,3.0,0.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,1.0,0.0,2.0,0.0,3.0,2.0,2.0,0.0,1.0,0.0,3.0,2.0,0.0,3.0,2.0,1.0,3.0,4.0,1.0,1.0,1.0,4.0,2.0,3.0,5.0,3.0,1.0,2.0,2.0,1.0,0.0,3.0,0.0,2.0,0.0,2.0,0.0,1.0,2.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,1.0,0.0,0.0,3.0,3.0,2.0,2.0,3.0,0.0,3.0,3.0,1.0,3.0,3.0,0.0,2.0,0.0,2.0,1.0,3.0,2.0,1.0,0.0,2.0,0.0,1.0,3.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,2.0,3.0,0.0,0.0,2.0,1.0,0.0,1.0,2.0,2.0,0.0,3.0,1.0,3.0,0.0,0.0,3.0,3.0,0.0,2.0,1.0,2.0,1.0,2.0,2.0,3.0,2.0,3.0,2.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,2.0,2.0,1.0,0.0,2.0,1.0,0.0,1.0,3.0,0.0,2.0,2.0,1.0,0.0,2.0,2.0,0.0,0.0,3.0,2.0,0.0,2.0,3.0,3.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,3.0,2.0,0.0,3.0,1.0,3.0,1.0,0.0,2.0,3.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0,3.0,2.0,2.0,2.0,2.0,0.0,1.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,2.0,1.0,3.0,2.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,3.0,3.0,3.0,0.0,0.0,3.0,1.0,3.0,1.0,2.0,3.0,0.0,0.0,2.0,3.0,2.0,2.0,2.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,3.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,0.0,1.0,2.0,3.0,1.0,0.0,2.0,1.0,0.0,0.0,3.0,2.0,1.0,0.0,0.0,3.0,1.0,0.0,2.0,1.0,2.0,1.0,2.0,2.0,1.0,3.0,0.0,0.0,2.0,0.0,2.0,3.0,1.0,0.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,3.0,3.0,2.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,2.0,0.0,2.0,3.0,1.0,2.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,3.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,2.0,1.0,2.0,0.0,2.0,0.0,1.0,2.0,1.0,3.0,2.0,1.0,2.0,3.0,3.0,0.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,2.0,1.0,3.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,3.0,0.0,0.0,1.0,3.0,0.0,3.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0,2.0,3.0,1.0,2.0,1.0,0.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,0.0,3.0,2.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,0.0,2.0,0.0,2.0,1.0,3.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,3.0,2.0,2.0,0.0,0.0,2.0,0.0,0.0,2.0,2.0,2.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,2.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,2.0,0.0,1.0,0.0,2.0,2.0,1.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,3.0,0.0,2.0,0.0,1.0,0.0,2.0,0.0,3.0,2.0,0.0,2.0,2.0,3.0
3,0.222222,0.186180,0.621444,0.643055,0.000000,0.0,0.447728,0.333333,0.356913,0.026840,0.001837,0.333333,0.555556,0.111111,0.111111,0.114583,0.0,0.222222,0.186180,0.621444,0.643055,0.000000,0.0,0.333333,0.356913,0.026840,0.001837,0.333333,0.555556,0.111111,0.111111,0.114583,0.0,3.0,2.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,2.0,4.0,0.0,2.0,0.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,0.0,1.0,3.0,3.0,3.0,2.0,0.0,3.0,0.0,2.0,1.0,2.0,1.0,3.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,2.0,3.0,3.0,1.0,0.0,3.0,0.0,1.0,2.0,0.0,1.0,3.0,2.0,3.0,1.0,1.0,3.0,2.0,4.0,4.0,3.0,1.0,1.0,3.0,0.0,4.0,3.0,0.0,3.0,2.0,1.0,0.0,1.0,3.0,1.0,3.0,1.0,2.0,1.0,1.0,2.0,0.0,3.0,0.0,1.0,0.0,3.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,3.0,3.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0,3.0,1.0,2.0,2.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,2.0,1.0,0.0,1.0,1.0,2.0,2.0,1.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,1.0,2.0,2.0,1.0,2.0,0.0,1.0,2.0,2.0,2.0,1.0,3.0,1.0,0.0,1.0,3.0,2.0,2.0,0.0,1.0,2.0,3.0,3.0,1.0,0.0,2.0,1.0,2.0,3.0,1.0,0.0,2.0,3.0,2.0,3.0,0.0,2.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,1.0,2.0,0.0,2.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,2.0,3.0,3.0,1.0,4.0,3.0,1.0,2.0,2.0,3.0,2.0,3.0,2.0,3.0,1.0,2.0,0.0,0.0,1.0,3.0,3.0,3.0,0.0,0.0,1.0,1.0,2.0,2.0,0.0,1.0,1.0,3.0,3.0,2.0,3.0,3.0,0.0,2.0,1.0,0.0,2.0,0.0,1.0,3.0,0.0,3.0,2.0,3.0,1.0,2.0,2.0,0.0,1.0,1.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,2.0,3.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,3.0,1.0,0.0,3.0,0.0,3.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,2.0,1.0,0.0,3.0,0.0,1.0,2.0,2.0,2.0,1.0,1.0,0.0,0.0,3.0,3.0,1.0,3.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0,1.0,2.0,0.0,1.0,2.0,0.0,1.0,0.0,3.0,1.0,2.0,2.0,2.0,0.0,1.0,2.0,0.0,0.0,3.0,1.0,2.0,2.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,2.0,3.0,2.0,1.0,2.0,1.0,2.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,3.0,2.0,3.0,2.0,3.0,3.0,2.0,1.0,2.0,2.0,0.0,3.0,1.0,0.0,1.0,1.0,3.0,0.0,1.0,2.0,0.0,1.0,1.0,3.0,2.0,0.0,3.0,0.0,1.0,3.0,2.0,3.0,0.0,3.0,0.0,2.0,1.0,0.0,1.0,2.0,2.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,0.0,2.0,0.0,1.0,2.0,3.0,0.0,1.0,2.0,2.0,1.0,3.0,1.0,2.0,0.0,1.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0,1.0,1.0,0.0,2.0,2.0,2.0,0.0,3.0,0.0,0.0,2.0,0.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,1.0,2.0,1.0,2.0,0.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,1.0,2.0,2.0,1.0,1.0,0.0,2.0,3.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,0.0,2.0,1.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,2.0,1.0
7,0.000000,0.043273,0.555594,0.643766,0.000000,0.0,0.468839,0.681159,0.713826,0.023703,0.000571,0.000000,0.000000,0.111111,0.333333,0.062500,0.0,0.000000,0.043273,0.555594,0.643766,0.000000,0.0,0.681159,0.713826,0.023703,0.000571,0.000000,0.000000,0.111111,0.333333,0.062500,0.0,3.0,2.0,2.0,3.0,3.0,4.0,0.0,1.0,2.0,3.0,4.0,2.0,3.0,2.0,1.0,2.0,3.0,2.0,0.0,3.0,2.0,0.0,2.0,1.0,0.0,0.0,3.0,2.0,3.0,2.0,1.0,1.0,3.0,0.0,0.0,3.0,3.0,0.0,2.0,0.0,5.0,3.0,2.0,1.0,4.0,0.0,1.0,3.0,1.0,3.0,2.0,1.0,1.0,0.0,4.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,1.0,3.0,2.0,1.0,2.0,0.0,3.0,1.0,2.0,0.0,1.0,3.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,2.0,0.0,2.0,3.0,1.0,0.0,0.0,2.0,0.0,2.0,0.0,2.0,1.0,0.0,1.0,1.0,3.0,0.0,2.0,3.0,1.0,0.0,0.0,1.0,3.0,1.0,3.0,1.0,0.0,2.0,3.0,1.0,0.0,0.0,2.0,1.0,2.0,2.0,1.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0,1.0,3.0,0.0,1.0,2.0,0.0,3.0,3.0,0.0,0.0,0.0,2.0,3.0,1.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0,2.0,1.0,1.0,2.0,2.0,3.0,1.0,3.0,0.0,2.0,3.0,0.0,2.0,0.0,1.0,4.0,2.0,0.0,1.0,3.0,0.0,1.0,0.0,3.0,3.0,3.0,0.0,2.0,1.0,0.0,2.0,0.0,2.0,0.0,3.0,3.0,0.0,2.0,3.0,1.0,2.0,2.0,1.0,0.0,2.0,2.0,2.0,2.0,1.0,0.0,3.0,3.0,1.0,0.0,3.0,3.0,1.0,2.0,0.0,1.0,2.0,0.0,2.0,2.0,1.0,2.0,2.0,3.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,2.0,1.0,3.0,0.0,0.0,1.0,2.0,1.0,3.0,2.0,2.0,1.0,1.0,0.0,1.0,2.0,0.0,0.0,2.0,3.0,1.0,0.0,2.0,2.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0,1.0,3.0,0.0,1.0,2.0,3.0,1.0,1.0,0.0,3.0,2.0,1.0,0.0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,3.0,3.0,2.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,3.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,2.0,2.0,1.0,1.0,0.0,3.0,1.0,1.0,1.0,2.0,2.0,0.0,2.0,0.0,3.0,3.0,3.0,0.0,3.0,1.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,3.0,1.0,1.0,1.0,3.0,0.0,1.0,2.0,0.0,1.0,2.0,3.0,2.0,3.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0,2.0,1.0,3.0,0.0,0.0,3.0,1.0,0.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0,0.0,1.0,2.0,1.0,2.0,3.0,1.0,1.0,2.0,0.0,2.0,1.0,1.0,2.0,1.0,1.0,3.0,2.0,1.0,0.0,2.0,2.0,3.0,0.0,0.0,3.0,3.0,0.0,2.0,0.0,0.0,3.0,0.0,2.0,3.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,1.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,0.0,0.0,2.0,1.0,2.0,0.0,3.0,2.0,2.0,0.0,1.0,0.0,0.0,3.0,3.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,3.0,2.0,2.0,2.0,1.0,3.0,1.0,1.0,2.0,2.0,0.0,0.0,0.0,2.0,2.0,3.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,2.0,2.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,3.0,2.0,2.0,3.0,2.0,1.0,1.0,2.0,2.0,2.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,2.0,3.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,3.0,0.0,2.0,0.0,1.0,2.0,2.0,2.0,3.0,2.0,2.0,0.0,2.0,3.0
12,0.166667,0.095533,0.222442,0.769991,0.000000,0.0,0.455402,0.195652,0.214362,0.025122,0.001401,0.222222,0.333333,0.333333,0.333333,0.131944,0.0,0.166667,0.095533,0.222442,0.769991,0.000000,0.0,0.195652,0.214362,0.025122,0.001401,0.222222,0.333333,0.333333,0.333333,0.131944,0.0,1.0,1.0,2.0,1.0,2.0,3.0,2.0,2.0,3.0,0.0,0.0,2.0,0.0,2.0,2.0,1.0,0.0,0.0,1.0,1.0,3.0,3.0,3.0,3.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,1.0,1.0,2.0,2.0,0.0,2.0,2.0,0.0,1.0,1.0,3.0,3.0,3.0,3.0,3.0,1.0,0.0,2.0,1.0,0.0,3.0,0.0,0.0,2.0,0.0,1.0,4.0,0.0,2.0,1.0,3.0,2.0,0.0,0.0,3.0,1.0,1.0,2.0,0.0,1.0,1.0,3.0,0.0,3.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,3.0,1.0,0.0,2.0,1.0,0.0,2.0,1.0,3.0,1.0,0.0,2.0,2.0,0.0,1.0,1.0,0.0,2.0,2.0,2.0,2.0,0.0,1.0,1.0,1.0,0.0,2.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0,2.0,0.0,0.0,3.0,0.0,0.0,1.0,1.0,3.0,1.0,3.0,2.0,1.0,2.0,3.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,3.0,1.0,2.0,1.0,3.0,3.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,0.0,2.0,2.0,2.0,2.0,3.0,3.0,0.0,2.0,1.0,3.0,3.0,1.0,4.0,1.0,3.0,2.0,0.0,3.0,3.0,3.0,1.0,0.0,0.0,1.0,2.0,2.0,1.0,0.0,2.0,3.0,1.0,1.0,2.0,0.0,1.0,3.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,0.0,0.0,1.0,3.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,2.0,0.0,0.0,2.0,3.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,1.0,3.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,2.0,0.0,2.0,1.0,0.0,0.0,0.0,2.0,2.0,3.0,3.0,1.0,2.0,0.0,2.0,1.0,1.0,0.0,2.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,0.0,3.0,1.0,2.0,1.0,1.0,2.0,1.0,0.0,2.0,1.0,3.0,0.0,0.0,2.0,0.0,1.0,3.0,1.0,3.0,2.0,0.0,2.0,1.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,3.0,1.0,2.0,0.0,1.0,3.0,0.0,2.0,3.0,2.0,2.0,0.0,0.0,2.0,0.0,1.0,2.0,1.0,2.0,2.0,0.0,1.0,0.0,1.0,2.0,3.0,0.0,0.0,2.0,3.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,2.0,3.0,2.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,3.0,1.0,0.0,1.0,3.0,3.0,1.0,3.0,1.0,1.0,0.0,2.0,3.0,0.0,1.0,3.0,0.0,0.0,3.0,3.0,0.0,3.0,2.0,2.0,0.0,0.0,2.0,1.0,2.0,2.0,3.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,1.0,2.0,0.0,2.0,0.0,0.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,0.0,3.0,1.0,2.0,1.0,0.0,3.0,3.0,0.0,0.0,1.0,1.0,3.0,0.0,0.0,1.0,2.0,0.0,1.0,1.0,2.0,1.0,2.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,3.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,2.0,2.0,1.0,3.0,0.0,2.0,1.0,1.0,3.0,1.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,2.0,2.0,0.0,2.0,1.0,2.0,0.0,3.0,2.0,0.0,2.0,1.0,3.0
13,0.166667,0.095795,0.296370,0.777369,0.400000,0.0,0.463063,0.260870,0.285102,0.027968,0.000138,0.000000,0.333333,0.222222,0.555556,0.131944,0.0,0.166667,0.095795,0.296370,0.777369,0.400000,0.0,0.260870,0.285102,0.027968,0.000138,0.000000,0.333333,0.222222,0.555556,0.131944,0.0,1.0,1.0,1.0,1.0,2.0,3.0,2.0,2.0,2.0,0.0,0.0,1.0,0.0,2.0,2.0,3.0,0.0,0.0,1.0,1.0,3.0,3.0,3.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,3.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,1.0,0.0,2.0,1.0,4.0,2.0,1.0,4.0,3.0,0.0,3.0,2.0,0.0,2.0,4.0,2.0,2.0,1.0,1.0,2.0,0.0,4.0,3.0,1.0,4.0,2.0,0.0,1.0,1.0,3.0,0.0,1.0,3.0,2.0,0.0,2.0,0.0,1.0,0.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,2.0,0.0,0.0,3.0,3.0,2.0,2.0,1.0,0.0,3.0,3.0,3.0,1.0,3.0,0.0,2.0,0.0,2.0,1.0,3.0,2.0,1.0,2.0,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,2.0,3.0,0.0,0.0,2.0,1.0,0.0,1.0,2.0,1.0,1.0,3.0,1.0,0.0,2.0,0.0,3.0,3.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,1.0,3.0,1.0,2.0,1.0,3.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,1.0,0.0,3.0,0.0,0.0,2.0,2.0,1.0,0.0,2.0,0.0,2.0,0.0,1.0,2.0,2.0,2.0,3.0,3.0,2.0,0.0,1.0,3.0,1.0,1.0,0.0,1.0,3.0,2.0,0.0,3.0,1.0,3.0,1.0,0.0,2.0,3.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,3.0,0.0,1.0,3.0,2.0,2.0,2.0,0.0,0.0,1.0,3.0,2.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,3.0,0.0,0.0,2.0,3.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,1.0,3.0,1.0,2.0,2.0,0.0,0.0,2.0,3.0,2.0,2.0,2.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,3.0,1.0,2.0,2.0,2.0,2.0,3.0,3.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0,2.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,0.0,3.0,1.0,2.0,1.0,1.0,2.0,1.0,0.0,2.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,3.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,3.0,0.0,1.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,3.0,3.0,2.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,2.0,0.0,2.0,3.0,1.0,2.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,3.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,2.0,1.0,2.0,0.0,2.0,0.0,1.0,2.0,1.0,3.0,2.0,1.0,2.0,3.0,0.0,2.0,3.0,0.0,0.0,2.0,0.0,3.0,2.0,1.0,2.0,0.0,1.0,3.0,3.0,1.0,3.0,1.0,2.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,0.0,3.0,3.0,3.0,2.0,0.0,3.0,1.0,2.0,2.0,3.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,1.0,2.0,0.0,0.0,1.0,2.0,0.0,0.0,2.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0,2.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,2.0,2.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,3.0,1.0,0.0,0.0,1.0,1.0,3.0,2.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,2.0,0.0,3.0,2.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,2.0,2.0,1.0,0.0,2.0,0.0,1.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,3.0,0.0,2.0,0.0,1.0,0.0,2.0,0.0,3.0,2.0,0.0,2.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77567,0.111111,0.115861,0.586972,0.440718,0.000000,0.0,0.466726,0.420290,0.428725,0.030477,0.001042,0.222222,0.444444,0.111111,0.111111,0.097222,1.0,0.111111,0.115861,0.586972,0.440718,0.000000,0.0,0.420290,0.428725,0.030477,0.001042,0.222222,0.444444,0.111111,0.111111,0.097222,1.0,3.0,0.0,2.0,1.0,1.0,2.0,2.0,2.0,3.0,0.0,4.0,0.0,0.0,1.0,3.0,2.0,3.0,2.0,2.0,3.0,2.0,2.0,0.0,1.0,2.0,0.0,1.0,2.0,3.0,0.0,2.0,1.0,3.0,1.0,2.0,3.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,3.0,1.0,0.0,1.0,4.0,0.0,3.0,2.0,2.0,0.0,3.0,3.0,1.0,2.0,1.0,3.0,2.0,1.0,3.0,3.0,1.0,0.0,1.0,3.0,2.0,3.0,5.0,0.0,1.0,3.0,2.0,0.0,0.0,3.0,3.0,2.0,1.0,2.0,3.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,3.0,3.0,2.0,2.0,0.0,0.0,2.0,0.0,1.0,3.0,1.0,2.0,2.0,2.0,1.0,1.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,2.0,1.0,2.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,2.0,1.0,3.0,2.0,0.0,1.0,2.0,0.0,3.0,1.0,3.0,2.0,0.0,0.0,3.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0,0.0,2.0,2.0,2.0,0.0,0.0,1.0,2.0,1.0,2.0,1.0,2.0,2.0,0.0,3.0,1.0,1.0,3.0,1.0,0.0,3.0,1.0,2.0,0.0,3.0,2.0,3.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,3.0,2.0,2.0,2.0,2.0,0.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,3.0,2.0,3.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,3.0,3.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,3.0,0.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,1.0,0.0,1.0,3.0,2.0,1.0,0.0,2.0,0.0,1.0,0.0,3.0,1.0,1.0,0.0,3.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0,2.0,3.0,0.0,1.0,1.0,3.0,0.0,0.0,1.0,2.0,1.0,3.0,1.0,2.0,0.0,0.0,0.0,3.0,3.0,2.0,1.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,3.0,3.0,2.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,3.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,3.0,1.0,2.0,0.0,1.0,1.0,1.0,2.0,0.0,3.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,3.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0,2.0,3.0,0.0,0.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0,0.0,1.0,1.0,3.0,0.0,2.0,2.0,3.0,0.0,2.0,1.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,2.0,1.0,1.0,0.0,1.0,2.0,0.0,2.0,3.0,3.0,2.0,0.0,3.0,1.0,1.0,3.0,2.0,0.0,1.0,3.0,0.0,2.0,3.0,0.0,1.0,0.0,2.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,0.0,1.0,2.0,1.0,1.0,2.0,0.0,0.0,1.0,0.0,2.0,0.0,2.0,2.0,1.0,2.0,0.0,2.0,1.0,3.0,3.0,3.0,2.0,1.0,3.0,2.0,1.0,3.0,3.0,1.0,0.0,0.0,1.0,2.0,1.0,2.0,0.0,1.0,0.0,0.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,3.0,2.0,1.0,1.0,2.0,0.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,0.0,0.0,2.0,2.0,0.0,1.0,2.0,3.0,1.0,1.0,3.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,3.0,1.0,1.0,2.0,1.0,0.0,2.0
77568,0.111111,0.121358,0.364561,0.731524,0.000000,0.0,0.458481,0.391304,0.428725,0.026402,0.000630,0.111111,0.444444,0.333333,0.333333,0.062500,0.0,0.111111,0.121358,0.364561,0.731524,0.000000,0.0,0.391304,0.428725,0.026402,0.000630,0.111111,0.444444,0.333333,0.333333,0.062500,0.0,4.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,0.0,0.0,2.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,3.0,0.0,3.0,3.0,1.0,2.0,2.0,0.0,1.0,2.0,1.0,3.0,0.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,2.0,1.0,4.0,3.0,3.0,3.0,1.0,3.0,0.0,3.0,2.0,2.0,4.0,3.0,0.0,0.0,2.0,0.0,3.0,2.0,1.0,3.0,3.0,2.0,1.0,1.0,4.0,2.0,3.0,5.0,0.0,2.0,2.0,2.0,1.0,0.0,1.0,3.0,0.0,0.0,2.0,0.0,1.0,0.0,2.0,2.0,1.0,1.0,0.0,2.0,2.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,3.0,3.0,0.0,2.0,3.0,1.0,2.0,3.0,3.0,1.0,3.0,0.0,2.0,1.0,1.0,1.0,3.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,3.0,0.0,2.0,1.0,2.0,1.0,1.0,0.0,0.0,2.0,3.0,2.0,0.0,0.0,1.0,3.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,2.0,1.0,3.0,0.0,0.0,2.0,0.0,2.0,2.0,2.0,2.0,3.0,3.0,0.0,2.0,1.0,0.0,3.0,2.0,0.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,3.0,3.0,0.0,2.0,0.0,2.0,1.0,0.0,1.0,3.0,1.0,1.0,2.0,0.0,1.0,1.0,3.0,0.0,1.0,1.0,2.0,2.0,2.0,2.0,0.0,1.0,3.0,2.0,1.0,3.0,1.0,0.0,0.0,1.0,0.0,1.0,3.0,2.0,1.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,3.0,3.0,3.0,0.0,0.0,0.0,1.0,3.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0,2.0,2.0,2.0,1.0,2.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,3.0,3.0,1.0,2.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,0.0,1.0,3.0,0.0,2.0,2.0,2.0,1.0,1.0,0.0,2.0,0.0,3.0,0.0,0.0,2.0,0.0,1.0,3.0,1.0,3.0,3.0,0.0,2.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0,2.0,1.0,2.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,1.0,2.0,3.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,0.0,2.0,3.0,2.0,0.0,2.0,3.0,0.0,2.0,2.0,3.0,2.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,3.0,0.0,0.0,1.0,1.0,2.0,0.0,2.0,3.0,2.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,2.0,1.0,2.0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,1.0,2.0,2.0,0.0,2.0,2.0,1.0,2.0,2.0,0.0,1.0,1.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,2.0,2.0,1.0,3.0,0.0,2.0,0.0,2.0,1.0,2.0,2.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,3.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,1.0,2.0,2.0,0.0,1.0,1.0,3.0,2.0,2.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,2.0,1.0,3.0
77569,0.222222,0.130780,0.592089,0.506976,0.000000,0.0,0.458139,0.260870,0.285102,0.027096,0.008500,0.555556,0.444444,0.555556,0.111111,0.166667,1.0,0.222222,0.130780,0.592089,0.506976,0.000000,0.0,0.260870,0.285102,0.027096,0.008500,0.555556,0.444444,0.555556,0.111111,0.166667,1.0,3.0,0.0,0.0,0.0,2.0,3.0,1.0,0.0,1.0,0.0,2.0,0.0,4.0,3.0,3.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0,0.0,2.0,3.0,1.0,2.0,0.0,3.0,2.0,2.0,1.0,2.0,2.0,2.0,0.0,1.0,1.0,2.0,2.0,2.0,0.0,2.0,2.0,3.0,0.0,0.0,3.0,1.0,1.0,0.0,0.0,1.0,2.0,3.0,1.0,1.0,1.0,2.0,2.0,4.0,4.0,1.0,1.0,0.0,3.0,0.0,1.0,3.0,1.0,2.0,2.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,1.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,3.0,3.0,0.0,2.0,0.0,0.0,2.0,0.0,1.0,3.0,0.0,2.0,0.0,2.0,1.0,1.0,0.0,1.0,0.0,1.0,2.0,0.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,0.0,3.0,1.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,2.0,3.0,2.0,0.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,3.0,2.0,0.0,2.0,0.0,2.0,0.0,1.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,3.0,1.0,2.0,2.0,2.0,0.0,1.0,1.0,4.0,3.0,1.0,2.0,2.0,3.0,3.0,3.0,2.0,0.0,1.0,2.0,0.0,0.0,2.0,3.0,2.0,1.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0,2.0,2.0,3.0,1.0,2.0,3.0,2.0,0.0,1.0,3.0,0.0,0.0,0.0,2.0,3.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,3.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,3.0,1.0,1.0,3.0,0.0,0.0,0.0,3.0,1.0,2.0,1.0,0.0,0.0,2.0,1.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0,2.0,0.0,2.0,2.0,0.0,2.0,0.0,1.0,0.0,1.0,1.0,2.0,1.0,0.0,2.0,1.0,2.0,0.0,2.0,1.0,2.0,0.0,1.0,3.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,0.0,1.0,3.0,4.0,1.0,3.0,2.0,2.0,1.0,0.0,2.0,0.0,0.0,2.0,1.0,2.0,1.0,0.0,2.0,0.0,2.0,0.0,1.0,1.0,2.0,0.0,0.0,2.0,2.0,1.0,2.0,0.0,2.0,1.0,1.0,0.0,3.0,2.0,1.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,2.0,3.0,3.0,0.0,3.0,3.0,3.0,3.0,1.0,2.0,3.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,3.0,2.0,3.0,2.0,2.0,3.0,3.0,0.0,1.0,1.0,2.0,1.0,1.0,1.0,3.0,2.0,0.0,2.0,3.0,2.0,2.0,0.0,2.0,1.0,1.0,1.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,3.0,0.0,1.0,1.0,2.0,2.0,0.0,2.0,1.0,0.0,3.0,3.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,2.0,2.0,2.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,3.0,0.0,1.0,1.0,2.0,3.0,2.0,2.0,1.0,2.0,1.0,3.0,2.0,2.0,2.0,0.0,3.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,0.0,2.0
77571,0.111111,0.119525,0.659157,0.400558,0.466667,0.0,0.459729,0.376812,0.428725,0.036188,0.001703,0.333333,0.444444,0.111111,0.111111,0.062500,0.0,0.111111,0.119525,0.659157,0.400558,0.466667,0.0,0.376812,0.428725,0.036188,0.001703,0.333333,0.444444,0.111111,0.111111,0.062500,0.0,3.0,0.0,1.0,1.0,1.0,2.0,2.0,2.0,3.0,0.0,4.0,0.0,0.0,2.0,3.0,0.0,3.0,1.0,2.0,3.0,2.0,2.0,0.0,1.0,3.0,0.0,3.0,3.0,2.0,0.0,2.0,0.0,2.0,1.0,2.0,3.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,5.0,2.0,2.0,1.0,0.0,2.0,0.0,3.0,2.0,2.0,3.0,3.0,3.0,1.0,2.0,0.0,3.0,2.0,1.0,3.0,3.0,1.0,0.0,1.0,4.0,2.0,3.0,3.0,0.0,1.0,3.0,2.0,1.0,0.0,3.0,3.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,1.0,2.0,0.0,1.0,1.0,3.0,3.0,0.0,3.0,2.0,1.0,0.0,2.0,0.0,1.0,3.0,1.0,2.0,2.0,0.0,1.0,0.0,3.0,1.0,0.0,1.0,0.0,0.0,3.0,2.0,1.0,3.0,1.0,2.0,1.0,0.0,2.0,3.0,0.0,0.0,2.0,1.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,3.0,0.0,2.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,3.0,2.0,0.0,0.0,1.0,2.0,1.0,2.0,0.0,1.0,2.0,0.0,3.0,1.0,1.0,0.0,1.0,0.0,3.0,1.0,2.0,1.0,3.0,2.0,3.0,0.0,2.0,1.0,0.0,2.0,2.0,0.0,1.0,2.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,0.0,1.0,3.0,0.0,1.0,0.0,1.0,0.0,3.0,1.0,1.0,2.0,0.0,0.0,1.0,3.0,3.0,3.0,0.0,0.0,1.0,1.0,2.0,2.0,0.0,1.0,2.0,3.0,3.0,2.0,3.0,3.0,0.0,2.0,1.0,0.0,2.0,0.0,1.0,3.0,0.0,3.0,2.0,3.0,1.0,2.0,2.0,0.0,1.0,1.0,2.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,2.0,1.0,1.0,0.0,2.0,1.0,3.0,2.0,0.0,1.0,3.0,2.0,2.0,3.0,3.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,2.0,3.0,0.0,2.0,1.0,3.0,3.0,0.0,2.0,2.0,1.0,3.0,1.0,2.0,0.0,0.0,2.0,3.0,0.0,2.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,0.0,3.0,3.0,0.0,1.0,2.0,0.0,0.0,2.0,2.0,1.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,0.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,2.0,0.0,2.0,3.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,2.0,1.0,1.0,2.0,0.0,3.0,2.0,0.0,2.0,1.0,2.0,0.0,2.0,0.0,1.0,2.0,1.0,3.0,2.0,1.0,2.0,0.0,3.0,0.0,2.0,2.0,3.0,0.0,2.0,1.0,1.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,1.0,1.0,3.0,0.0,1.0,2.0,2.0,1.0,1.0,3.0,2.0,0.0,0.0,0.0,1.0,3.0,2.0,3.0,0.0,3.0,0.0,2.0,1.0,0.0,1.0,2.0,2.0,2.0,0.0,2.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,0.0,1.0,2.0,0.0,1.0,2.0,2.0,1.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,2.0,2.0,0.0,2.0,0.0,2.0,0.0,3.0,3.0,1.0,2.0,2.0,2.0,2.0,1.0,3.0,3.0,1.0,0.0,2.0,1.0,2.0,1.0,2.0,0.0,2.0,0.0,0.0,3.0,0.0,2.0,1.0,1.0,1.0,0.0,2.0,2.0,0.0,2.0,0.0,2.0,2.0,0.0,3.0,1.0,0.0,0.0,2.0,0.0,0.0,3.0,1.0,2.0,1.0,2.0,0.0,1.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,0.0,1.0,2.0,2.0,1.0,1.0,0.0,2.0,3.0,0.0,0.0,2.0,0.0,1.0,1.0,1.0,3.0,2.0,0.0,2.0,1.0,0.0,2.0,3.0,0.0,0.0,2.0,0.0,2.0,1.0


### Use Recursive Feature Elimination to choose top performing features.

In [102]:
# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select=10)

# fit the data using RFE
rfe.fit(X_train_scaled,y_train.logerror)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()

In [105]:
best_feats = rfe_feature

In [110]:
X_train_scaled[best_feats[:3]]

Unnamed: 0,cola,cluster_409,cluster_422
18253,0.0,1,0
29637,0.0,2,2
66783,0.0,0,1
58524,0.0,1,2
3965,0.0,3,1
...,...,...,...
29239,0.0,3,3
16805,1.0,0,3
38915,0.0,2,0
45130,0.0,0,1


# Now use the top five clusters on my OLS model.

In [111]:
y_train

Unnamed: 0,logerror,logerror_pred_mean,logerror_pred_median,logerror_pred_lm,logerror_pred_lars,logerror_pred_lm2,logerror_pred_rfe_top10
18253,-0.241830,0.017837,0.007131,0.032319,0.017837,0.017169,0.021851
29637,0.116972,0.017837,0.007131,0.015009,0.017837,0.016716,0.028931
66783,-0.021770,0.017837,0.007131,0.031397,0.017837,0.026762,0.020020
58524,-0.036804,0.017837,0.007131,0.015402,0.017837,0.019221,0.023071
3965,-0.022598,0.017837,0.007131,0.010482,0.017837,0.013891,0.015869
...,...,...,...,...,...,...,...
29239,-0.000416,0.017837,0.007131,0.022704,0.017837,0.033243,0.022949
16805,-0.012333,0.017837,0.007131,0.019006,0.017837,0.013145,0.012207
38915,-0.065222,0.017837,0.007131,0.024489,0.017837,0.022407,0.027466
45130,0.070491,0.017837,0.007131,0.016449,0.017837,0.010634,0.013916


In [170]:
# create the model object
lm2 = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm2.fit(X_train_scaled[best_feats], y_train.logerror)

# predict train
y_train['logerror_pred_selectkbest'] = lm2.predict(X_train_scaled[best_feats])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_lm2)**(1/2)

# predict validate
y_validate['logerror_pred_selectkbest'] = lm2.predict(X_validate_scaled[best_feats])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_lm2)**(1/2)

print("RMSE for OLS Model using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for OLS Model using LinearRegression
Training/In-Sample:  0.16778999021369378 
Validation/Out-of-Sample:  0.17056519876985646


In [171]:
y_train

Unnamed: 0,logerror,logerror_pred_mean,logerror_pred_median,logerror_pred_lm,logerror_pred_lars,logerror_pred_lm2,logerror_pred_rfe_top10,logerror_pred_rfe_top3,logerror_pred_selectkbest
18253,-0.241830,0.017837,0.007131,0.032319,0.017837,0.017169,0.021851,0.021851,0.017169
29637,0.116972,0.017837,0.007131,0.015009,0.017837,0.016716,0.028931,0.028931,0.016716
66783,-0.021770,0.017837,0.007131,0.031397,0.017837,0.026762,0.020020,0.020020,0.026762
58524,-0.036804,0.017837,0.007131,0.015402,0.017837,0.019221,0.023071,0.023071,0.019221
3965,-0.022598,0.017837,0.007131,0.010482,0.017837,0.013891,0.015869,0.015869,0.013891
...,...,...,...,...,...,...,...,...,...
29239,-0.000416,0.017837,0.007131,0.022704,0.017837,0.033243,0.022949,0.022949,0.033243
16805,-0.012333,0.017837,0.007131,0.019006,0.017837,0.013145,0.012207,0.012207,0.013145
38915,-0.065222,0.017837,0.007131,0.024489,0.017837,0.022407,0.027466,0.027466,0.022407
45130,0.070491,0.017837,0.007131,0.016449,0.017837,0.010634,0.013916,0.013916,0.010634


In [176]:
y_validate[['logerror','logerror_pred_selectkbest']]

Unnamed: 0,logerror,logerror_pred_selectkbest
68142,-0.065596,0.007319
76543,0.052794,0.023277
54903,-0.058549,-0.004735
38106,-0.012052,-0.000910
18005,-0.344629,-0.009091
...,...,...
6182,0.055281,0.023349
40343,0.049437,0.008374
74104,0.002362,0.034125
30559,0.015763,0.037391


# Going with Select K Best's top ten performing features

In [178]:
X_test_scaled

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,roomcnt,assessmentyear,age,age_bin,taxrate,acres,acres_bin,sqft_bin,structure_dollar_sqft_bin,lot_dollar_sqft_bin,bath_bed_ratio,cola
42377,0.222222,0.139155,0.332369,0.620181,0.000000,0.0,0.086957,0.142551,0.028926,0.004138,0.444444,0.444444,0.222222,0.222222,0.166667,0.0
5602,0.000000,0.085238,0.338989,0.713830,0.000000,0.0,0.463768,0.499464,0.029111,0.000723,0.111111,0.222222,0.222222,0.333333,0.027778,0.0
43594,0.111111,0.158262,0.622026,0.481283,0.000000,0.0,0.420290,0.428725,0.026664,0.002582,0.333333,0.555556,0.444444,0.333333,0.097222,1.0
32058,0.222222,0.081225,0.556751,0.529892,0.000000,0.0,0.318841,0.356913,0.026329,0.003062,0.333333,0.222222,0.444444,0.111111,0.270833,1.0
18021,0.111111,0.111324,0.559160,0.528242,0.000000,0.0,0.543478,0.571275,0.026488,0.000827,0.111111,0.444444,0.111111,0.333333,0.166667,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9033,0.000000,0.062293,0.518727,0.807608,0.000000,0.0,0.449275,0.499464,0.028723,0.001049,0.222222,0.111111,0.111111,0.333333,0.027778,0.0
64797,0.222222,0.118566,0.547002,0.663218,0.000000,0.0,0.681159,0.713826,0.026371,0.000919,0.222222,0.444444,0.333333,0.444444,0.114583,1.0
23162,0.222222,0.103036,0.423132,0.547123,0.000000,0.0,0.478261,0.499464,0.026551,0.000864,0.111111,0.333333,0.777778,0.444444,0.166667,1.0
15287,0.222222,0.174926,0.307641,0.806774,0.533333,0.0,0.442029,0.499464,0.029312,0.001007,0.222222,0.555556,0.111111,0.333333,0.114583,0.0


In [177]:
# create the model object
lm2 = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
#lm2.fit(X_train_scaled[best_feats], y_train.logerror)
lm2.fit(X_validate_scaled[best_feats], y_validate.logerror)
#lm2.fit(X_test_scaled[best_feats], y_test.logerror)

# predict train
y_train['logerror_pred_selectkbest'] = lm2.predict(X_train_scaled[best_feats])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_selectkbest)**(1/2)

# predict validate
y_validate['logerror_pred_selectkbest'] = lm2.predict(X_validate_scaled[best_feats])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_selectkbest)**(1/2)

# predict test
y_test['logerror_pred_selectkbest'] = lm2.predict(X_test_scaled[best_feats])

# evaluate: rmse
rmse_test = mean_squared_error(y_test.logerror, y_test.logerror_pred_selectkbest)**(1/2)

print("RMSE for OLS Model using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate,
      "\nTest/Out-of-Sample: ", rmse_test)

KeyError: "['cluster_186', 'cluster_457', 'cluster_95', 'cluster_451', 'cluster_516', 'cluster_317', 'cluster_499', 'cluster_408', 'cluster_123'] not in index"