# Exploration notebook

* [Imports](#imports)
* [Data loading](#data-loading)
* [Missing values](#missing-values)
    * [Quantification](#quantification)
    * [Imputation v1](#imputation-v1)
* [Data filtering](#data-filtering)
    * [Row filter v1](#row-filter-v1)
    * [Column filter v1](#column-filter-v1)
* [Distributions](#distributions)
    * [Numerical features](#numerical-features)
    * [Categorical features](#categorical-features)
* [Correlations](#correlations)
* [Feature engineering](#feature-engineering)
    * [Feature engineering v1](#feature-engineering-v1)
* [Exports](#exports)


<a name="imports"></a>
## Imports

In [None]:
import os
import pandas as pd
import numpy as np
import missingno as msno

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
from scipy import stats

<a name="data-loading"></a>
## Data loading

In [None]:
# Local path of the dataset; the download below is skipped when a file
# with this name already exists in the working directory.
file_name = "data.csv"
if not os.path.exists(file_name):
    !wget https://s3.eu-west-1.amazonaws.com/course.oc-static.com/projects/Data_Scientist_P4/2016_Building_Energy_Benchmarking.csv -q --show-progress -O $file_name
# Print the first two lines of the file as a quick sanity check.
!head -2 $file_name

In [None]:
# Load the CSV into a DataFrame, list its column names and preview the first rows.
df = pd.read_csv(file_name)
print(df.columns)
df.head()

<a name="missing-values"></a>
## Missing values

<a name="quantification"></a>
### Quantification 

In [None]:
# Number of missing (NaN) entries in each column of the raw frame.
df.isna().sum()

In [None]:
# missingno heatmap of the raw frame, used to look at how missingness in one
# column relates to missingness in the others.
msno.heatmap(df)

<a name="imputation-v1"></a>
### Imputation v1

In [None]:
# Start the v1 frame from the raw data, keeping only the columns selected by
# missingno's 'top' completeness filter with a 0.5 cut-off.
df_v1 = msno.nullity_filter(df, filter='top', p=0.5)

In [None]:
# Summary statistics of the ENERGYSTARScore column before imputing it.
df_v1['ENERGYSTARScore'].describe()

In [None]:
# Impute missing ENERGYSTARScore values with 0.
df_v1['ENERGYSTARScore'] = df_v1['ENERGYSTARScore'].fillna(0)

In [None]:
# Drop every row that still has a missing value in any remaining column.
df_v1 = df_v1.dropna()

<a name="data-filtering"></a>
## Data filtering

<a name="row-filter-v1"></a>
### Row filter v1

Drop the "Multifamily" building types to keep only the non-residential buildings.

In [None]:
# Collect every BuildingType label that mentions "Multifamily" ...
to_drop = [val for val in df_v1['BuildingType'].unique() if 'Multifamily' in val]
# ... and remove all matching rows with a single boolean mask instead of
# re-filtering the whole frame once per label.
df_v1 = df_v1[~df_v1['BuildingType'].isin(to_drop)]

<a name="column-filter-v1"></a>
### Column filter v1

In [None]:
# Column names, non-null counts and dtypes after the row filter.
df_v1.info()

In [None]:
# Remove the columns listed below (identifiers, address/location fields and
# compliance metadata, judging by their names).
df_v1 = df_v1.drop(
    labels=[
        'OSEBuildingID',
        'DataYear',
        'BuildingType',
        'PropertyName',
        'Address',
        'City',
        'State',
        'ZipCode',
        'TaxParcelIdentificationNumber',
        'CouncilDistrictCode',
        'Neighborhood',
        'ComplianceStatus',
        'DefaultData',
    ],
    axis='columns',
)

In [None]:
# Print the number of distinct values in each property-use column, in the
# same order as before.
for col_name in ('PrimaryPropertyType',
                 'ListOfAllPropertyUseTypes',
                 'LargestPropertyUseType'):
    print(len(df_v1[col_name].unique()))

In [None]:
# Remove the ListOfAllPropertyUseTypes column.
df_v1 = df_v1.drop(columns=['ListOfAllPropertyUseTypes'])

<a name="distributions"></a>
## Distributions

<a name="numerical-features"></a>
### Numerical features

In [None]:
# Numeric columns only, selected through the public select_dtypes API rather
# than the private DataFrame._get_numeric_data() helper, which is not part of
# the supported pandas interface.
df_num = df_v1.select_dtypes(include="number")

In [None]:
# describe() summary of the numeric columns, extended with three extra rows:
# variance, skewness and kurtosis.
df_desc = pd.concat([df_num.describe(), df_num.agg(['var', 'skew', 'kurt'])])
df_desc

In [None]:
# Kernel density estimate of the strictly positive TotalGHGEmissions values,
# drawn on a log-scaled axis (hence the > 0 filter).
positive_ghg = df_v1.loc[df_v1["TotalGHGEmissions"] > 0, "TotalGHGEmissions"]
sns.displot(x=positive_ghg, kind="kde", log_scale=True)

In [None]:
# Q-Q plot of TotalGHGEmissions against a log-normal distribution whose
# parameters are fitted to the data (fit=True), with a 45-degree reference line.
# NOTE(review): unlike the KDE cell, values <= 0 are not filtered out here --
# confirm that is intended for a log-normal comparison.
sm.qqplot(df_v1["TotalGHGEmissions"], stats.lognorm, fit=True, line="45")
plt.show()

<a name="categorical-features"></a>
### Categorical features

In [None]:
# Pie chart of the row count per PrimaryPropertyType, restricted to the types
# whose count exceeds the `per_lim` fraction of all rows.
per_lim = 0.03
min_rows = per_lim * len(df_v1)
type_counts = df_v1.groupby('PrimaryPropertyType').size()
df_grp = type_counts[type_counts > min_rows]
df_grp.plot.pie(autopct='%.2f')

<a name="correlations"></a>
## Correlations

In [None]:
# Pairwise correlations between the numeric/boolean columns only. df_v1 still
# contains text columns (e.g. PrimaryPropertyType); pandas >= 2.0 raises on
# them in DataFrame.corr() instead of silently skipping them, so select the
# usable columns explicitly.
corr = df_v1.select_dtypes(include=["number", "bool"]).corr()

# Generate a mask for the upper triangle (diagonal included)
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio, explicitly on the
# axes created above
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# Re-list the remaining columns and dtypes before the pairwise checks below.
df_v1.info()

In [None]:
# TotalGHGEmissions plotted against YearBuilt as a line plot.
sns.lineplot(data=df_v1, x="YearBuilt", y="TotalGHGEmissions")

In [None]:
# Row-wise sum of the three per-source energy columns (electricity, natural
# gas, steam), all expressed in kBtu.
e_sum = (
    df_v1["Electricity(kBtu)"]
    + df_v1["NaturalGas(kBtu)"]
    + df_v1["SteamUse(kBtu)"]
)
# Compare that sum with the reported site energy use. "SiteEnergyUse(kBtu)" is
# used so the plot shows the same relationship as the linregress call on that
# column; this cell previously plotted "SiteEnergyUseWN(kBtu)" (presumably the
# weather-normalized variant -- TODO confirm), which is a different quantity.
sns.scatterplot(data=df_v1, x="SiteEnergyUse(kBtu)", y=e_sum)

In [None]:
# Linear regression of the per-source energy sum on the reported site energy use.
res = stats.linregress(x=df_v1["SiteEnergyUse(kBtu)"], y=e_sum)
res

In [None]:
# Scatter of the per-source energy sum (e_sum) against TotalGHGEmissions.
sns.scatterplot(data=df_v1, x="TotalGHGEmissions", y=e_sum)

In [None]:
# Linear regression of the per-source energy sum on TotalGHGEmissions.
res = stats.linregress(x=df_v1["TotalGHGEmissions"], y=e_sum)
res

In [None]:
# Row-wise sum of the parking and building floor-area columns, plotted against
# the reported total to check whether the total is just their sum.
gfa_sum = (
    df_v1["PropertyGFAParking"]
    + df_v1["PropertyGFABuilding(s)"]
)
sns.scatterplot(data=df_v1, x="PropertyGFATotal", y=gfa_sum)

In [None]:
# Linear regression of the parking + building floor-area sum on PropertyGFATotal.
res = stats.linregress(x=df_v1["PropertyGFATotal"], y=gfa_sum)
res

In [None]:
# Remove PropertyGFATotal, which the cells above compare against the sum of
# PropertyGFAParking and PropertyGFABuilding(s). DataFrame.drop returns a new
# frame, so the result must be assigned back; the bare call left df_v1
# unchanged and the column ended up in the export.
df_v1 = df_v1.drop(columns=["PropertyGFATotal"])

<a name="feature-engineering"></a>
## Feature engineering

<a name="feature-engineering-v1"></a>
### Feature engineering v1

In [None]:
# Flag the rows whose ENERGYSTARScore was missing. Missing scores were replaced
# by 0 during imputation, so a zero marks an imputed value; the previous `> 0`
# test set the flag on exactly the opposite rows.
# NOTE(review): assumes a genuine score is never 0 -- confirm against the data.
df_v1["ENERGYSTARScore_isna"] = df_v1["ENERGYSTARScore"] == 0

In [None]:
# Number of rows for each value of the ENERGYSTARScore_isna flag.
df_v1.groupby("ENERGYSTARScore_isna")["ENERGYSTARScore_isna"].count()

<a name="exports"></a>
## Exports

In [None]:
# Write the cleaned v1 frame to disk, without the index column.
df_v1.to_csv("data_cleaned_v1.csv", index=False)