In [1]:
%autosave 0

Autosave disabled


In [2]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from env import get_connection
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from scipy import stats
from wrangle import train_val_test


In [None]:
def acquire_data():
    """Query the `zillow` database for single-family residential properties.

    The connection string comes from `get_connection('zillow')`; the query
    joins `properties_2017` to `propertylandusetype` and keeps only rows
    described as 'Single Family Residential'.

    Returns:
        pandas.DataFrame with the columns bedroomcnt, bathroomcnt,
        calculatedfinishedsquarefeet, taxvaluedollarcnt, yearbuilt,
        taxamount and fips.
    """
    sql = '''
            SELECT bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet,
                   taxvaluedollarcnt, yearbuilt, taxamount, fips
            FROM properties_2017
            JOIN propertylandusetype
                ON properties_2017.propertylandusetypeid = propertylandusetype.propertylandusetypeid
            WHERE propertylandusetype.propertylandusedesc = 'Single Family Residential';
            '''
    # No caching here: every call goes back to the database.
    return pd.read_sql(sql, get_connection('zillow'))
# Run the query once and keep the result in `df` for the cells that follow.
df = acquire_data()
# Summary of columns, non-null counts and dtypes.
df.info()

In [None]:
# Preview the frame already loaded above instead of calling acquire_data()
# again, which would repeat the full database query just to display it.
df.head()

In [None]:
# (rows, columns) of the frame returned by acquire_data().
df.shape

In [None]:
# Number of null values in each column.
df.isna().sum()

Since bedroomcnt, bathroomcnt, and taxvaluedollarcnt have the fewest null values, I decided that dropping the rows where they are missing would not meaningfully affect the data. The other features contain many more nulls, so I will impute those with the column mean across all observations.

In [None]:
def clean_and_impute_data(df):
    """Drop rows missing key fields, then mean-impute the remaining gaps.

    Rows with a null in bedroomcnt, bathroomcnt or taxvaluedollarcnt are
    removed. Nulls in calculatedfinishedsquarefeet, yearbuilt and taxamount
    are then replaced with that column's mean, computed over the rows that
    survived the drop.

    The frame is modified in place and also returned, so the caller's
    variable and the return value refer to the same object.

    Args:
        df: pandas.DataFrame containing the six columns named above.

    Returns:
        The same DataFrame object, cleaned.
    """
    columns_to_drop_null = ['bedroomcnt', 'bathroomcnt', 'taxvaluedollarcnt']
    columns_to_impute = ['calculatedfinishedsquarefeet', 'yearbuilt', 'taxamount']

    # Drop rows with null values in the key columns.
    df.dropna(subset=columns_to_drop_null, inplace=True)

    for column in columns_to_impute:
        # Assign the filled column back to the frame. Calling
        # df[column].fillna(..., inplace=True) acts on a temporary Series:
        # pandas deprecates that pattern and it has no effect under
        # Copy-on-Write.
        df[column] = df[column].fillna(df[column].mean())

    return df

# clean_and_impute_data modifies `df` in place and returns it, so
# `cleaned_df` and `df` refer to the same DataFrame after this call.
cleaned_df = clean_and_impute_data(df)


In [None]:
df.shape  # a few observations have been dropped

In [None]:
# Summary statistics for each column after cleaning.
df.describe()

In [None]:
df.info()  # every column is float, so the dtypes look fine

In [None]:
# Columns to plot, one histogram each.
cols = ['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'taxvaluedollarcnt', 'yearbuilt', 'taxamount', 'fips']

# Size the grid from the column list so every panel is used; a fixed panel
# count leaves blank slots whenever it does not match len(cols).
# squeeze=False keeps `axes` two-dimensional even for a single column.
fig, axes = plt.subplots(nrows=1, ncols=len(cols), figsize=(16, 3), squeeze=False)

for ax, col in zip(axes[0], cols):
    # Display histogram for column.
    df[col].hist(bins=5, ax=ax)

    # Title with column name.
    ax.set_title(col)

    # Hide gridlines.
    ax.grid(False)

# Keep neighbouring titles and tick labels from overlapping.
fig.tight_layout()
plt.show()

In [None]:


plt.figure(figsize=(10,14))

# Create boxplots for every column except fips.
sns.boxplot(data=df.drop(columns=['fips']))
plt.show()

In [None]:
import os
import pandas as pd
from env import get_connection  # Make sure these variables are defined in env.py



def get_zillow_data():
    """Return the Zillow single-family data, preferring a local CSV cache.

    If "zillow.csv" exists in the working directory it is read (first
    column as the index) and returned. Otherwise the query is run against
    the database from `get_connection('zillow')`, the result is written to
    "zillow.csv", and the freshly queried DataFrame is returned.
    """
    cache_path = "zillow.csv"

    # Cache hit: skip the database entirely.
    if os.path.isfile(cache_path):
        return pd.read_csv(cache_path, index_col=0)

    # Cache miss: pull the data from the database.
    sql = '''
                SELECT bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet,
                       taxvaluedollarcnt, yearbuilt, taxamount, fips
                FROM properties_2017
                JOIN propertylandusetype
                    ON properties_2017.propertylandusetypeid = propertylandusetype.propertylandusetypeid
                WHERE propertylandusetype.propertylandusedesc = 'Single Family Residential';
                '''
    zillow = pd.read_sql(sql, get_connection('zillow'))

    # Persist the result so the next call takes the cache-hit path.
    zillow.to_csv(cache_path)
    return zillow

def wrangle_zillow():
    '''
    Load the Zillow data through get_zillow_data(), discard every row that
    contains a NaN, cast all remaining columns with astype('int'), and
    return the resulting DataFrame.
    '''
    # NOTE(review): the integer cast truncates any fractional values;
    # confirm that is acceptable for every column.
    return get_zillow_data().dropna().astype('int')


In [None]:
# Rebinds `df`: from here on it is the frame returned by wrangle_zillow(),
# not the in-place cleaned frame from the earlier cells.
df = wrangle_zillow()

In [None]:
# Confirm column dtypes and non-null counts after wrangling.
df.info()

In [None]:
def apply_scaling(df_train, df_validate, df_test, scaler=None):
    """Scale the numeric columns of all three splits with one fitted scaler.

    The scaler is fitted on the training split only and then used to
    transform train, validate and test. The inputs are not modified;
    scaled copies are returned.

    Args:
        df_train: training split (pandas.DataFrame).
        df_validate: validation split (pandas.DataFrame).
        df_test: test split (pandas.DataFrame).
        scaler: an unfitted scikit-learn scaler instance such as
            MinMaxScaler(). Defaults to StandardScaler() when omitted.

    Returns:
        Tuple (df_train, df_validate, df_test) of scaled copies.
    """
    if scaler is None:
        scaler = StandardScaler()

    # Columns to scale
    cols_to_scale = ['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'taxvaluedollarcnt', 'yearbuilt', 'taxamount']

    # Fit on an instance, and on the training data only, so nothing from
    # validate/test influences the scaling parameters.
    scaler.fit(df_train[cols_to_scale])

    scaled_splits = []
    for split in (df_train, df_validate, df_test):
        # Work on a copy so the caller's frames keep their original values.
        split = split.copy()
        split[cols_to_scale] = scaler.transform(split[cols_to_scale])
        scaled_splits.append(split)

    return tuple(scaled_splits)


# Split first: a scaling object must only ever be fitted on the train set.
seed = 42

# 80% train, then the remaining 20% split evenly into validate and test.
train, val_test = train_test_split(df, train_size=0.8, random_state=seed)
val, test = train_test_split(val_test, train_size=0.5, random_state=seed)

train.shape, val.shape, test.shape

def prepare_zillow_data():
    """Acquire, clean, split and min-max scale the Zillow data.

    Returns:
        Tuple (train, validate, test) of DataFrames as returned by
        apply_scaling when given a MinMaxScaler instance.
    """
    # Acquire and clean data
    df = acquire_data()
    df_cleaned = clean_and_impute_data(df)

    # Visualize cleaned data
    # NOTE(review): visualize_data is not defined in the visible part of
    # this notebook; confirm it exists before running this function.
    visualize_data(df_cleaned)

    # Split data into train, validate, and test sets. This step was missing,
    # which left the three split names undefined at the apply_scaling call.
    df_train, df_validate, df_test = train_val_test(df_cleaned)

    # Apply scaling using MinMaxScaler
    df_train_scaled, df_validate_scaled, df_test_scaled = apply_scaling(
        df_train, df_validate, df_test, MinMaxScaler()
    )

    return df_train_scaled, df_validate_scaled, df_test_scaled

In [None]:
# One figure per object-typed column, showing how its values are distributed.
object_columns = df.columns[df.dtypes == 'object']
for object_col in object_columns:
    fig, ax = plt.subplots()
    sns.histplot(data=df, x=object_col, ax=ax)
    ax.set_title(f'Distribution of {object_col}')
    plt.show()

In [None]:
# Numeric columns can take many distinct values (e.g. taxvaluedollarcnt), so
# bin them with a histogram. countplot treats each distinct value as its own
# category and would draw one bar per value.
for col in df.columns[df.dtypes != 'object']:
    plt.figure()
    sns.histplot(data = df, x=col)
    plt.title(f'Count of {col}')
    plt.show()

In [None]:
# Split with the train_val_test helper imported from wrangle; this rebinds
# train, val and test.
train, val, test = train_val_test(df)
train.shape, val.shape, test.shape

In [None]:
# taxvaluedollarcnt is a column, not a variable: read it from the training
# split (a bare name here raises NameError).
train['taxvaluedollarcnt'].unique()

Questions:

1. Does yearbuilt have a relationship with taxvaluedollarcnt?
2. Does square footage (calculatedfinishedsquarefeet) have a relationship with taxvaluedollarcnt?
3. Does bedroomcnt have a relationship with taxvaluedollarcnt?

## Question 1 

Does year built (yearbuilt) have a relationship with tax value (taxvaluedollarcnt)?

Year built is numerical and tax value is numerical, so this compares two numerical variables.

In [None]:
# One bar per yearbuilt value in the training split, with height taken from
# taxvaluedollarcnt using seaborn's default aggregation.
year_tax_ax = sns.barplot(x='yearbuilt', y='taxvaluedollarcnt', data=train)
year_tax_ax.set_title("year vs tax")
plt.show()


## Question 2

Does square footage (calculatedfinishedsquarefeet) have a relationship with taxvaluedollarcnt?

## Question 3

Does bedroomcnt have a relationship with taxvaluedollarcnt?