## Gas Cleaned Dataset

**Summary**
* Converted timestamp object to datetime 
    * All other types are float64
* Deleted columns with more than 50% missing values 
* Used interpolation to fill in missing values 
    * Gas usage is approximately linear according to the GitHub documentation, so `slinear` (first-order spline, i.e. piecewise-linear) interpolation was used
* Used backward fill (`bfill`) to fill Panther_education_Teofila’s 2923 remaining missing values


In [None]:
import os
#print the full path of every file found under the Kaggle input directory
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
#imports
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno

In [None]:
#load the gas meter readings from the Building Data Genome Project 2 input folder into a DataFrame
gas = pd.read_csv("/kaggle/input/buildingdatagenomeproject2/gas_cleaned.csv")

In [None]:
#print a summary of the frame: column dtypes, non-null counts and memory usage
gas.info()

In [None]:
#preview the first rows of the raw data
gas.head()

In [None]:
#(number of rows, number of columns) before any cleaning
gas.shape

In [None]:
#show the dtype of every column before the timestamp conversion
gas.dtypes

In [None]:
#convert the timestamp column to datetime, parsing with an explicit "YYYY-MM-DD HH:MM:SS" format
gas["timestamp"] = pd.to_datetime(gas["timestamp"], format = "%Y-%m-%d %H:%M:%S")

In [None]:
#show the dtype of every column again
#to check that the conversion of timestamp to datetime worked
gas.dtypes

In [None]:
#count the missing values in each column
gas.isnull().sum()

In [None]:
#visualize where the missing values are in the raw data (missingno nullity matrix)
msno.matrix(gas)

In [None]:
#function shows the percentage of missing values and type of the values
def missing_data(data):
    """Summarise the missing values and dtype of every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Transposed summary: one column per column of ``data`` and two rows.
        ``'Percentage_of_Missing_Values'`` holds the *fraction* of null
        entries (0-1, not 0-100) and ``'Data Type'`` holds the dtype name as
        a string. Because the two rows are of different kinds, the returned
        frame has object dtype.
    """
    summary = pd.DataFrame(
        {
            # mean of the boolean null mask == nulls / total rows
            'Percentage_of_Missing_Values': data.isnull().mean(),
            # iterate data.dtypes (not data[col]) so duplicate column names work
            'Data Type': [str(dtype) for dtype in data.dtypes],
        }
    )
    return summary.T

In [None]:
#display the missing-value fraction and dtype of every column
missing_data(gas)

In [None]:
#collect the names of the columns in which more than half of the values are missing
temp = missing_data(gas)
missing_fraction = temp.loc['Percentage_of_Missing_Values']
col_names = missing_fraction[missing_fraction > 0.5].index

In [None]:
#inspect the columns that are about to be dropped
gas[col_names]

In [None]:
#remove the columns/locations with more than 50% missing values
#pass the column labels directly instead of a DataFrame slice, which only
#worked because iterating a DataFrame yields its column names
gas_cleaned = gas.drop(columns=col_names)

In [None]:
#preview the data after dropping the sparse columns
gas_cleaned.head()

In [None]:
#(number of rows, number of columns) after dropping the sparse columns
gas_cleaned.shape

In [None]:
#visualize the missing values that remain after dropping the sparse columns
msno.matrix(gas_cleaned)

In [None]:
#fill gaps by interpolating each column with the 'slinear' method
#NOTE(review): values before a column's first valid reading appear to stay NaN after this step
#(Panther_education_Teofila is handled separately further down) - confirm
gas_cleaned = gas_cleaned.interpolate(method='slinear')

In [None]:
#count the missing values left in each column after interpolation
gas_cleaned.isnull().sum()

In [None]:
#visualize the missing values that remain after interpolation
msno.matrix(gas_cleaned)

In [None]:
#number of missing values left in Panther_education_Teofila, the last column that still has any
gas_cleaned["Panther_education_Teofila"].isnull().sum()

In [None]:
#shows the number of non-zero values per column (timestamp excluded)
#NOTE(review): NaN casts to True under astype(bool), so remaining missing values are counted as non-zero - confirm this is intended
gas_cleaned.loc[:, gas_cleaned.columns != 'timestamp'].astype(bool).sum(axis=0)

In [None]:
#backward fill the remaining missing values of Panther_education_Teofila
#bfill() copies the next valid reading backward; fillna('bfill') would instead
#write the literal string 'bfill' into every NaN cell and turn the column into object dtype
gas_cleaned = gas_cleaned.bfill()

In [None]:
#visualize the data one last time to check whether any missing values remain
msno.matrix(gas_cleaned)

In [None]:
#save the cleaned data as csv in the working directory
#NOTE(review): with the default index=True the row index is written as an extra unnamed first column - pass index=False if that is not wanted
gas_cleaned.to_csv('gas_cleaned_new.csv')