# Exploration notebook

* [Imports](#imports)
* [Data loading](#data-loading)
* [Missing values](#missing-values)
    * [Quantification](#quantification)
    * [Imputation](#imputation)
* [Data filtering](#data-filtering)
    * [Row filter](#row-filter)
    * [Column filter](#column-filter)
* [Distributions](#distributions)
    * [Numerical features](#numerical-features)
    * [Categorical features](#categorical-features)
* [Correlations](#correlations)
* [Feature engineering](#feature-engineering)
    * [Feature engineering](#feature-engineering)
* [Exports](#exports)


<a name="imports"></a>
## Imports

In [None]:
import os
import pandas as pd
import numpy as np
import missingno as msno

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
from scipy import stats

<a name="data-loading"></a>
## Data loading

In [None]:
# Local path of the dataset; the remote CSV is fetched only when this file does not exist yet.
file_name = "data.csv"
if not os.path.exists(file_name):
    !wget https://s3.eu-west-1.amazonaws.com/course.oc-static.com/projects/Data_Scientist_P4/2016_Building_Energy_Benchmarking.csv -q --show-progress -O $file_name
# Show the first two raw lines of the file as a quick sanity check.
!head -2 $file_name

In [None]:
# Load the CSV into a DataFrame, print its column index and preview the first rows.
df = pd.read_csv(file_name)
print(df.columns)
df.head()

<a name="missing-values"></a>
## Missing values

<a name="quantification"></a>
### Quantification 

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# missingno heatmap built from the missing-value pattern of df.
msno.heatmap(df)

<a name="imputation"></a>
### Imputation

In [None]:
# Apply missingno's "top" nullity filter with a 0.5 completeness threshold;
# the arguments are spelled out by keyword for readability.
df_v2 = msno.nullity_filter(df, filter='top', p=0.5)

In [None]:
# Summary statistics of the ENERGYSTARScore column, rounded to two decimals.
df_v2['ENERGYSTARScore'].describe().round(2)

<a name="data-filtering"></a>
## Data filtering

<a name="row-filter"></a>
### Row filter

Drop the "Multifamily" building types to keep only the non-residential buildings.

In [None]:
# Remove every row whose building type contains "Multifamily".
# A single vectorised substring match replaces the per-value Python loop that
# re-filtered the frame once per matching value. With na=False a missing
# building type is treated as "not multifamily" (the row is kept) instead of
# raising a TypeError on the NaN value.
is_multifamily = df_v2['BuildingType'].str.contains('Multifamily', regex=False, na=False)
df_v2 = df_v2[~is_multifamily]

Drop the buildings with zero or negative energy use or emissions (e.g. energy production).

In [None]:
# Count of rows whose total GHG emissions are zero or negative.
print(df_v2["TotalGHGEmissions"].le(0).sum())

In [None]:
# Count of rows whose SiteEnergyUseWN value is zero or negative.
print(df_v2["SiteEnergyUseWN(kBtu)"].le(0).sum())

In [None]:
# Count of rows where either SiteEnergyUseWN or TotalGHGEmissions is zero or negative.
energy_non_positive = df_v2["SiteEnergyUseWN(kBtu)"].le(0)
ghg_non_positive = df_v2["TotalGHGEmissions"].le(0)
print((energy_non_positive | ghg_non_positive).sum())

In [None]:
# Count of rows with a strictly negative steam use.
print(df_v2["SteamUse(kBtu)"].lt(0).sum())

In [None]:
# Count of rows whose electricity use is zero or negative.
print(df_v2["Electricity(kBtu)"].le(0).sum())

In [None]:
# Count of rows with a zero or negative value in any of the three columns
# used by the row filter (TotalGHGEmissions, SiteEnergyUseWN, Electricity).
# Electricity is tested with <= 0 (previously < 0) so that this preview agrees
# with the `> 0` condition the filter applies to every one of these columns.
# Rows with a missing value are not counted here: a comparison against NaN is False.
non_positive = (
    (df_v2["SiteEnergyUseWN(kBtu)"] <= 0)
    | (df_v2["TotalGHGEmissions"] <= 0)
    | (df_v2["Electricity(kBtu)"] <= 0)
)
print(non_positive.sum())

In [None]:
# Count of rows with a strictly negative natural gas use.
print(df_v2["NaturalGas(kBtu)"].lt(0).sum())

In [None]:
# Keep only the rows that are strictly positive in all three columns below;
# a missing value compares as False, so such rows are removed as well.
positive_cols = ["TotalGHGEmissions", "SiteEnergyUseWN(kBtu)", "Electricity(kBtu)"]
all_positive = df_v2[positive_cols].gt(0).all(axis=1)
df_v2 = df_v2[all_positive]

<a name="column-filter"></a>
### Column filter

In [None]:
# Column dtypes and non-null counts of the row-filtered frame.
df_v2.info()

In [None]:
# Columns removed from the working dataset.
cols_to_drop = [
    'OSEBuildingID',
    'DataYear',
    'PropertyName',
    'Address',
    'City',
    'State',
    'ZipCode',
    'TaxParcelIdentificationNumber',
    'CouncilDistrictCode',
    'Neighborhood',
    'ComplianceStatus',
    'DefaultData',
]
df_v2 = df_v2.drop(columns=cols_to_drop)

In [None]:
# Number of distinct values in each column, one count per line.
# dropna=False counts a missing value as one distinct value, as len(unique()) does.
for col in ('BuildingType',
            'PrimaryPropertyType',
            'ListOfAllPropertyUseTypes',
            'LargestPropertyUseType'):
    print(df_v2[col].nunique(dropna=False))

In [None]:
# Remove the ListOfAllPropertyUseTypes column.
df_v2 = df_v2.drop(columns=['ListOfAllPropertyUseTypes'])

<a name="distributions"></a>
## Distributions

<a name="numerical-features"></a>
### Numerical features

In [None]:
# Numeric (and boolean) columns only. The public select_dtypes API replaces the
# private DataFrame._get_numeric_data() helper, which may change or disappear
# between pandas releases.
df_num = df_v2.select_dtypes(include=["number", "bool"])

In [None]:
# describe() summary extended with three extra rows: variance, skewness and kurtosis.
df_desc = df_num.describe()
extra_stats = (
    ('var', df_num.var()),
    ('skew', df_num.skew()),
    ('kurt', df_num.kurtosis()),
)
for stat_name, stat_values in extra_stats:
    df_desc.loc[stat_name] = stat_values.tolist()
df_desc.round(2)

In [None]:
# Pair plot on a 10% random sample of the rows. The fixed random_state makes
# the sampled rows, and therefore the figure, identical on every run.
sns.pairplot(df_v2.sample(frac=0.1, random_state=42), corner=True)

In [None]:
# Kernel density estimate of the strictly positive emissions on a log-scaled axis.
positive_ghg = df_v2.loc[df_v2["TotalGHGEmissions"].gt(0), "TotalGHGEmissions"]
sns.displot(x=positive_ghg, kind="kde", log_scale=True)

In [None]:
# Q-Q plot of the emissions against a fitted log-normal distribution, with a 45-degree reference line.
ghg_values = df_v2["TotalGHGEmissions"].dropna()
sm.qqplot(ghg_values, dist=stats.lognorm, fit=True, line="45")
plt.show()

<a name="categorical-features"></a>
### Categorical features

In [None]:
# Pie chart of the primary property types that account for more than
# per_lim (3%) of the rows; rarer types are left out of the chart.
per_lim = 0.03
min_rows = per_lim * len(df_v2)
df_grp = df_v2.groupby('PrimaryPropertyType').size()
df_grp = df_grp[df_grp.gt(min_rows)]
df_grp.plot.pie(autopct='%.2f')

<a name="correlations"></a>
## Correlations

In [None]:
# Pairwise correlation of the numeric (and boolean) columns.
# The columns are selected explicitly because df_v2 still contains text columns:
# since pandas 2.0, DataFrame.corr() no longer drops them silently and raises a
# ValueError instead. Selecting first works on old and new pandas versions alike.
corr = df_v2.select_dtypes(include=["number", "bool"]).corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# Column dtypes and non-null counts of the current working frame.
df_v2.info()

In [None]:
# Line plot of TotalGHGEmissions against YearBuilt.
sns.lineplot(data=df_v2, x="YearBuilt", y="TotalGHGEmissions")

In [None]:
# Sum of the three energy sources: electricity, natural gas and steam.
e_sum = (
    df_v2["Electricity(kBtu)"]
    + df_v2["NaturalGas(kBtu)"]
    + df_v2["SteamUse(kBtu)"]
)
# Plot the sum against SiteEnergyUse(kBtu), the column used by the linear
# regression that follows, so that the figure and the fit describe the same
# pair of variables (the plot previously used SiteEnergyUseWN(kBtu)).
sns.scatterplot(data=df_v2, x="SiteEnergyUse(kBtu)", y=e_sum)

In [None]:
# Linear regression of the summed energy sources on SiteEnergyUse(kBtu).
site_energy = df_v2["SiteEnergyUse(kBtu)"]
res = stats.linregress(x=site_energy, y=e_sum)
res

In [None]:
# Scatter plot of the summed energy sources (e_sum) against TotalGHGEmissions.
sns.scatterplot(data=df_v2, x="TotalGHGEmissions", y=e_sum)

In [None]:
# Linear regression of the summed energy sources on TotalGHGEmissions.
ghg = df_v2["TotalGHGEmissions"]
res = stats.linregress(x=ghg, y=e_sum)
res

In [None]:
# Sum of the parking and building floor areas, plotted against PropertyGFATotal.
gfa_sum = df_v2["PropertyGFAParking"].add(df_v2["PropertyGFABuilding(s)"])
sns.scatterplot(data=df_v2, x="PropertyGFATotal", y=gfa_sum)

In [None]:
# Linear regression of the summed floor areas on PropertyGFATotal.
gfa_total = df_v2["PropertyGFATotal"]
res = stats.linregress(x=gfa_total, y=gfa_sum)
res

In [None]:
# Remove the PropertyGFATotal column.
df_v2 = df_v2.drop("PropertyGFATotal", axis=1)

In [None]:
# Scatter plot of PropertyGFABuilding(s) against PropertyGFAParking.
sns.scatterplot(data=df_v2, x="PropertyGFAParking", y="PropertyGFABuilding(s)")

In [None]:
# Scatter plot of TotalGHGEmissions against PropertyGFABuilding(s).
sns.scatterplot(data=df_v2, x="PropertyGFABuilding(s)", y="TotalGHGEmissions")

In [None]:
# Scatter plot of SiteEnergyUse(kBtu) against PropertyGFABuilding(s).
sns.scatterplot(data=df_v2, x="PropertyGFABuilding(s)", y="SiteEnergyUse(kBtu)")

<a name="feature-engineering"></a>
## Feature engineering

<a name="feature-engineering"></a>
### Feature engineering

In [None]:
# Flag that is True when the building HAS an ENERGY STAR score.
# notna() replaces isna(): the previous flag was True for rows where the score
# was missing, the opposite of what its name says and of the other is_* flags
# in this notebook, which are True when the value is present.
df_v2["is_ENERGYSTARScore"] = df_v2["ENERGYSTARScore"].notna()

In [None]:
# Number of rows for each value of the is_ENERGYSTARScore flag.
flag_col = "is_ENERGYSTARScore"
df_v2.groupby(flag_col)[flag_col].count()

In [None]:
# Flag that is True when the steam use is strictly positive.
df_v2["is_SteamUse"] = df_v2["SteamUse(kBtu)"].gt(0)

In [None]:
# Number of rows for each value of the is_SteamUse flag.
flag_col = "is_SteamUse"
df_v2.groupby(flag_col)[flag_col].count()

In [None]:
# Flag that is True when the natural gas use is strictly positive.
df_v2["is_NaturalGas"] = df_v2["NaturalGas(kBtu)"].gt(0)

In [None]:
# Number of rows for each value of the is_NaturalGas flag.
flag_col = "is_NaturalGas"
df_v2.groupby(flag_col)[flag_col].count()

In [None]:
# Flag that is True when the parking floor area is strictly positive.
df_v2["is_PropertyGFAParking"] = df_v2["PropertyGFAParking"].gt(0)

In [None]:
# Number of rows for each value of the is_PropertyGFAParking flag.
flag_col = "is_PropertyGFAParking"
df_v2.groupby(flag_col)[flag_col].count()

In [None]:
# Share of steam in the summed steam, natural gas and electricity use, rounded to one decimal.
energy_total = (
    df_v2["SteamUse(kBtu)"]
    + df_v2["NaturalGas(kBtu)"]
    + df_v2["Electricity(kBtu)"]
)
df_v2["ratio_SteamUse"] = df_v2["SteamUse(kBtu)"].div(energy_total).round(1)

In [None]:
# Share of natural gas in the summed steam, natural gas and electricity use, rounded to one decimal.
energy_total = (
    df_v2["SteamUse(kBtu)"]
    + df_v2["NaturalGas(kBtu)"]
    + df_v2["Electricity(kBtu)"]
)
df_v2["ratio_NaturalGas"] = df_v2["NaturalGas(kBtu)"].div(energy_total).round(1)

<a name="exports"></a>
## Exports

In [None]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df_v2.to_csv("data_cleaned_v2.csv", index=False)