In [2]:
import wrangle_final

import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from env import user, password, host
import env
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import csv
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Unpack the three frames returned by wrangled_file().
# The module is imported as `wrangle_final`; the bare name `wrangle` is never
# bound in this notebook, so calling through it raises NameError on a fresh kernel.
zillow_train, zillow_validate, zillow_test = wrangle_final.wrangled_file()

In [None]:
# Column dtypes and non-null counts of zillow_train.
zillow_train.info()

In [None]:
# Summary statistics, transposed so each column of zillow_train becomes a row.
zillow_train.describe().T

In [None]:
def scale(scaler,zillow_train,cols=['bedroom','bathroom','sqtft','fullbathcnt'],return_scaler=True):
    """Fit ``scaler`` on the selected columns and return a scaled copy of the frame.

    Parameters
    ----------
    scaler : object exposing ``fit_transform`` (e.g. ``MinMaxScaler()``).
        It is fitted on ``zillow_train[cols]``.
    zillow_train : pandas.DataFrame
        Frame to scale. It is not modified; the result is a copy.
    cols : list-like of column names
        Columns replaced by their scaled values. All other columns are
        carried over unchanged.
    return_scaler : bool, default True
        When True, return ``(scaler, scaled_frame)``. When False, return only
        the scaled frame.

    Returns
    -------
    tuple or pandas.DataFrame
        ``(scaler, scaled_frame)`` or ``scaled_frame``, depending on
        ``return_scaler``.
    """
    # Materialise as a list so any list-like selects columns (a tuple would be
    # read as a single key) and the shared default is never aliased.
    columns = list(cols)
    zillow_train_scaled = zillow_train.copy()
    zillow_train_scaled[columns] = scaler.fit_transform(zillow_train[columns])
    if return_scaler:
        return scaler, zillow_train_scaled
    return zillow_train_scaled

In [None]:
# Min-max scale the default columns of zillow_train; keep the fitted scaler and the scaled copy.
scaler,zillow_scaled=scale(MinMaxScaler(),zillow_train)

In [None]:
# Column dtypes and non-null counts of the scaled copy.
zillow_scaled.info()

In [None]:
# Summary statistics of the scaled copy, one row per column.
zillow_scaled.describe().T

## Question 1
- Is there a relationship between log error and age?

In [None]:
# Scatter of logerror against age, one colour per county.
ax = sns.scatterplot(data=zillow_train, x='age', y='logerror', hue='county')
ax.set(xlabel="Age", ylabel="LogError", title="LogError and Age")
plt.show()

## Key Takeaway
- Age and log error do not appear to have a significant relationship in any county once a property is more than 80 years old.
- Most of the properties built within the last 60 years that show significant log error are in Orange County.
- The next step is to explore whether there is a significant relationship between taxrate and logerror.

## Question 2
- Is there a relationship between taxrate and logerror?

In [None]:
# Joint distribution of logerror (x axis) and taxrate (y axis), coloured by county.
grid = sns.jointplot(y="taxrate", x="logerror", data=zillow_train, hue='county')
# Label through the JointGrid so each axis is named after the variable it
# actually shows: x is logerror, y is taxrate.
grid.set_axis_labels("Logerror", "Taxrate")
plt.show()

## Key Takeaway
- Log error is again noticeably larger for Orange County, mirroring what was seen with age.
- The next step is to focus on Orange County and explore the cause of its log error.

In [None]:
# Significance level compared against the one-tailed p-value in the hypothesis test.
alpha = .05

In [None]:
# Number of rows per county in zillow_train.
zillow_train.county.value_counts()

In [None]:
import scipy.stats as stats

# Split logerror into the Orange County sample and the sample of every other row.
orange = zillow_train.loc[zillow_train.county == 'Orange County, CA', 'logerror']
other = zillow_train.loc[zillow_train.county != 'Orange County, CA', 'logerror']
# Display both sample variances side by side.
(orange.var(), other.var())

In [None]:
# Two-sample t-test that does not assume equal variances (equal_var=False).
t, p = stats.ttest_ind(orange, other, equal_var=False)
# p is halved so it can be read as a one-tailed p-value next to alpha.
print(t, p/2, alpha)

In [None]:
# One-tailed decision: H0 is rejected only when the halved p-value does not
# exceed alpha and t is not negative.
fails_to_reject = (p/2 > alpha) or (t < 0)
print("We fail to reject H0" if fails_to_reject else "We reject H0 ")

## Key Takeaway
- Orange County's logerror is significantly different from that of the other counties
- Orange County's logerror variance is 59.9% higher than that of the other counties

## Question 4
- Is there a significant relationship between logerror and the combined bathroom and bedroom count?

In [None]:
# Joint plot to look for a linear relationship between room count and logerror.
print("Is there a relationship\nbetween room count and logerror?")
room = zillow_train.bathroom + zillow_train.bedroom
grid = sns.jointplot(data=zillow_train, x=room, y='logerror', hue='county')
grid.ax_joint.set_xlabel("Room")
grid.ax_joint.set_ylabel("Logerror")
plt.show()

In [None]:
# Per-county count of rows whose logerror is at least 1.5.
zillow_train[zillow_train.logerror>=1.5].county.value_counts()

## Key Takeaway
- Orange County has significantly higher logerror
- Orange County has higher logerror for room counts between 3 and 8
- The county appears to play a significant role in logerror
- We recommend modeling separately: one model for Orange County and another for the remaining counties

## Question 5
- Can we achieve a lower logerror when Orange County is separated from the rest of the data?

In [None]:
# Bar chart of the mean logerror per county, labelled so the figure stands on its own.
ax = zillow_train.groupby('county').logerror.mean().plot.bar()
ax.set(xlabel='county', ylabel='mean logerror', title='Mean logerror by county')
plt.show()

In [None]:
# Total room count per row (bathrooms plus bedrooms).
nroom = zillow_train['bathroom'] + zillow_train['bedroom']
# county1 collapses the two named counties into a single 'Other' label;
# any other county value is kept as-is.
zillow_train['county1'] = zillow_train['county'].replace(
    {'Los Angeles County, CA': 'Other', 'Ventura County, CA': 'Other'}
)

In [None]:
# Room count vs. logerror, coloured by the collapsed county1 label.
ax = sns.scatterplot(data=zillow_train, x=nroom, y='logerror', hue='county1')
ax.set_xlabel("Room")
ax.set_ylabel("Logerror")
plt.show()

In [None]:
# Overall mean logerror minus the mean logerror of rows labelled 'Other' in county1.
zillow_train.logerror.mean()-zillow_train[zillow_train.county1=='Other'].logerror.mean()


## Key Takeaway
- If we remove Orange County, the mean logerror does not change significantly
- The modeling will therefore keep Orange County in the data

# Cluster

In [None]:
# Scale the listed feature columns on all three frames with the project's scale_data helper.
# The module is imported as `wrangle_final`; the bare name `wrangle` is never
# bound in this notebook, so calling through it raises NameError on a fresh kernel.
scaled_train, scaled_validate, scaled_test = wrangle_final.scale_data(
    zillow_train,
    zillow_validate,
    zillow_test,
    cols=['bathroom','bedroom','taxrate','month','age','sqtft'],
)

In [None]:
# Build the feature matrix X used for clustering.
from sklearn.cluster import KMeans
X = scaled_train[['bathroom','bedroom','taxrate','month','age','sqtft']]
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'.
# Prefer the new name when it is available and fall back to the legacy one.
grid_style = (
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # Inertia for k = 2..11. A fixed random_state pins the centroid
    # initialisation so the elbow curve is the same on every run.
    inertia_by_k = pd.Series(
        {k: KMeans(n_clusters=k, random_state=42).fit(X).inertia_ for k in range(2, 12)}
    )
    inertia_by_k.plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
# Fit k-means with 3 clusters on X and store each row's cluster label on scaled_train.
# A fixed random_state pins the centroid initialisation so labels are stable across runs.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)

scaled_train['cluster'] = kmeans.predict(X)


In [None]:
# Show only the first rows, transposed so each column reads as a row,
# instead of transposing and dumping the entire frame.
zillow_train.head().T

In [None]:
# Cluster assignments in bathroom vs. sqtft space.
sns.relplot(x='bathroom', y='sqtft', hue='cluster', data=scaled_train);

In [None]:
# Cluster assignments in month vs. sqtft space.
sns.relplot(x='month', y='sqtft', hue='cluster', data=scaled_train);

In [None]:
# Cluster assignments in sqtft vs. logerror space.
sns.relplot(x='sqtft', y='logerror', hue='cluster', data=scaled_train);

In [None]:
# Cluster centres as a frame with one column per feature of X.
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
plt.figure(figsize=(14, 9))
# One scatter layer per cluster label so each cluster gets its own colour and legend entry.
for cluster, subset in scaled_train.groupby('cluster'):
    plt.scatter(subset.sqtft, subset.taxrate, label='cluster ' + str(cluster), alpha=.6)
# The centres were fitted on X, which is taken from scaled_train, so they
# share these axes with the points.
plt.scatter(centroids.sqtft, centroids.taxrate, marker='x', s=200, c='black', label='centroid')
# Axis labels name the columns actually plotted; set once, after all layers are drawn.
plt.legend()
plt.xlabel('sqtft')
plt.ylabel('taxrate')
plt.title('Visualizing Cluster Centers')
plt.show()