In [105]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
warnings.simplefilter('ignore')
matplotlib.rcParams['figure.dpi'] = 100
sns.set()
%matplotlib inline

building = pd.read_csv('building_metadata.csv')
weather_train = pd.read_csv('weather_train.csv')
weather_test = pd.read_csv('weather_test.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [106]:
#Before we merge the dataset into one training dataset, we need to preprocessing the missing value first
def missing_statistics(df):    
    statitics = pd.DataFrame(df.isnull().sum()).reset_index()
    statitics.columns=['COLUMN NAME',"MISSING VALUES"]
    statitics['TOTAL ROWS'] = df.shape[0]
    statitics['% MISSING'] = round((statitics['MISSING VALUES']/statitics['TOTAL ROWS'])*100,2)
    return statitics

def fill_missing_column(df,filler_df,col):
    null_df = df.loc[df[col].isnull()]
    
    if null_df.empty != True:
        null_df[col] = null_df.apply(lambda x: filler_df.loc[x['site_id']][x['day']][x['month']], axis=1)
        df.loc[null_df.index, col] = null_df[col]
    
    return df

In [86]:
missing_statistics(building)

Unnamed: 0,COLUMN NAME,MISSING VALUES,TOTAL ROWS,% MISSING
0,site_id,0,1449,0.0
1,building_id,0,1449,0.0
2,primary_use,0,1449,0.0
3,square_feet,0,1449,0.0
4,year_built,774,1449,53.42
5,floor_count,1094,1449,75.5


In [87]:
missing_statistics(weather_train)

Unnamed: 0,COLUMN NAME,MISSING VALUES,TOTAL ROWS,% MISSING
0,site_id,0,139773,0.0
1,timestamp,0,139773,0.0
2,air_temperature,55,139773,0.04
3,cloud_coverage,69173,139773,49.49
4,dew_temperature,113,139773,0.08
5,precip_depth_1_hr,50289,139773,35.98
6,sea_level_pressure,10618,139773,7.6
7,wind_direction,6268,139773,4.48
8,wind_speed,304,139773,0.22


In [88]:
missing_statistics(train)

Unnamed: 0,COLUMN NAME,MISSING VALUES,TOTAL ROWS,% MISSING
0,building_id,0,20216100,0.0
1,meter,0,20216100,0.0
2,timestamp,0,20216100,0.0
3,meter_reading,0,20216100,0.0


In [107]:
def fill_weather_dataset(weather_df):
    
    # Add new Features
    weather_df["timestamp"] = pd.to_datetime(weather_df["timestamp"])
    weather_df["day"] = weather_df["timestamp"].dt.day
    weather_df["week"] = weather_df["timestamp"].dt.week
    weather_df["month"] = weather_df["timestamp"].dt.month
    
    air_temperature_filler = weather_df.groupby(['site_id','day','month'])['air_temperature'].mean()
    weather_df = fill_missing_column(weather_df,air_temperature_filler,'air_temperature')    

    cloud_coverage_filler = weather_df.groupby(['site_id','day','month'])['cloud_coverage'].mean()
    cloud_coverage_filler = cloud_coverage_filler.fillna('ffill')
    weather_df = fill_missing_column(weather_df,cloud_coverage_filler,'cloud_coverage')

    due_temperature_filler = weather_df.groupby(['site_id','day','month'])['dew_temperature'].mean()
    weather_df = fill_missing_column(weather_df,due_temperature_filler,'dew_temperature')

    sea_level_filler = weather_df.groupby(['site_id','day','month'])['sea_level_pressure'].mean()
    sea_level_filler = sea_level_filler.fillna('ffill')
    weather_df = fill_missing_column(weather_df,sea_level_filler,'sea_level_pressure')

    wind_direction_filler = weather_df.groupby(['site_id','day','month'])['wind_direction'].mean()
    weather_df = fill_missing_column(weather_df,wind_direction_filler,'wind_direction')

    wind_speed_filler = weather_df.groupby(['site_id','day','month'])['wind_speed'].mean()
    weather_df = fill_missing_column(weather_df,wind_speed_filler,'wind_speed')

    precip_depth_filler = weather_df.groupby(['site_id','day','month'])['precip_depth_1_hr'].mean()
    precip_depth_filler = precip_depth_filler.fillna('ffill')
    weather_df = fill_missing_column(weather_df,precip_depth_filler,'precip_depth_1_hr')

    weather_df.drop(['day','week','month'],axis=1)
    
    return weather_df

weather_train = fill_weather_dataset(weather_train)

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,day,week,month
0,0,2016-01-01 00:00:00,25.0,6,20.0,-0.173913,1019.7,0.0,0.0,1,53,1
1,0,2016-01-01 01:00:00,24.4,4.28571,21.1,-1,1020.2,70.0,1.5,1,53,1
2,0,2016-01-01 02:00:00,22.8,2,21.1,0,1020.2,0.0,0.0,1,53,1
3,0,2016-01-01 03:00:00,21.1,2,20.6,0,1020.1,0.0,0.0,1,53,1
4,0,2016-01-01 04:00:00,20.0,2,20.0,-1,1020,250.0,2.6,1,53,1
5,0,2016-01-01 05:00:00,19.4,4.28571,19.4,0,1018.93,0.0,0.0,1,53,1
6,0,2016-01-01 06:00:00,21.1,6,21.1,-1,1019.4,0.0,0.0,1,53,1
7,0,2016-01-01 07:00:00,21.1,4.28571,21.1,0,1018.8,210.0,1.5,1,53,1
8,0,2016-01-01 08:00:00,20.6,4.28571,20.0,0,1018.1,0.0,0.0,1,53,1
9,0,2016-01-01 09:00:00,21.1,4.28571,20.6,0,1019,290.0,1.5,1,53,1


In [108]:
#Below is the statistics table of missing value in each given dataset
missing_statistics(building)

Unnamed: 0,COLUMN NAME,MISSING VALUES,TOTAL ROWS,% MISSING
0,site_id,0,1449,0.0
1,building_id,0,1449,0.0
2,primary_use,0,1449,0.0
3,square_feet,0,1449,0.0
4,year_built,774,1449,53.42
5,floor_count,1094,1449,75.5


In [109]:
missing_statistics(weather_train)

Unnamed: 0,COLUMN NAME,MISSING VALUES,TOTAL ROWS,% MISSING
0,site_id,0,139773,0.0
1,timestamp,0,139773,0.0
2,air_temperature,0,139773,0.0
3,cloud_coverage,0,139773,0.0
4,dew_temperature,0,139773,0.0
5,precip_depth_1_hr,0,139773,0.0
6,sea_level_pressure,0,139773,0.0
7,wind_direction,0,139773,0.0
8,wind_speed,0,139773,0.0
9,day,0,139773,0.0


In [110]:
missing_statistics(train)

Unnamed: 0,COLUMN NAME,MISSING VALUES,TOTAL ROWS,% MISSING
0,building_id,0,20216100,0.0
1,meter,0,20216100,0.0
2,timestamp,0,20216100,0.0
3,meter_reading,0,20216100,0.0


In [52]:
# Merge datasets into one train or test dataset
train = train.merge(building, on='building_id', how='left')
test = test.merge(building, on='building_id', how='left')

train = train.merge(weather_train, on=['site_id', 'timestamp'], how='left')
test = test.merge(weather_test, on=['site_id', 'timestamp'], how='left')
del weather_train, weather_test,building
gc.collect()

200

In [45]:
def fill_missingValue(data, N):
    mis_data = np.nonzero(pd.isnull(data))
    for i in range(len(mis_data)):
        #win_mean = (data.iloc[i-N: i+N]).mean()
        print(data.iloc[mis_data[i]-5: mis_data[i]+5])
        # Preprocessing features with missing values
fill_missingValue(train['air_temperature'], 5)
#Find the index of missing values

TypeError: cannot do slice indexing on <class 'pandas.core.indexes.numeric.Int64Index'> with these indexers [[     717      718      719 ... 20201026 20201027 20201028]] of <class 'numpy.ndarray'>

In [8]:
train.isnull().sum()

building_id                  0
meter                        0
timestamp                    0
meter_reading                0
site_id                      0
primary_use                  0
square_feet                  0
year_built            12127645
floor_count           16709167
air_temperature          96658
cloud_coverage         8825365
dew_temperature         100140
precip_depth_1_hr      3749023
sea_level_pressure     1231669
wind_direction         1449048
wind_speed              143676
dtype: int64

In [32]:
for i in np.nonzero(pd.isnull(train['air_temperature'])):
    print(train['air_temperature'][i])

722        NaN
723        NaN
724        NaN
725        NaN
726        NaN
727        NaN
728        NaN
729        NaN
730        NaN
731        NaN
732        NaN
733        NaN
734        NaN
735        NaN
736        NaN
737        NaN
738        NaN
739        NaN
740        NaN
741        NaN
742        NaN
743        NaN
744        NaN
745        NaN
746        NaN
747        NaN
748        NaN
749        NaN
750        NaN
751        NaN
            ..
20200583   NaN
20200584   NaN
20200585   NaN
20200586   NaN
20200587   NaN
20200588   NaN
20200589   NaN
20200590   NaN
20200591   NaN
20200592   NaN
20200593   NaN
20200594   NaN
20200595   NaN
20200596   NaN
20200597   NaN
20200598   NaN
20201020   NaN
20201021   NaN
20201022   NaN
20201023   NaN
20201024   NaN
20201025   NaN
20201026   NaN
20201027   NaN
20201028   NaN
20201029   NaN
20201030   NaN
20201031   NaN
20201032   NaN
20201033   NaN
Name: air_temperature, Length: 96658, dtype: float64


In [54]:
missing_statistics(train)

Unnamed: 0,COLUMN NAME,MISSING VALUES,TOTAL ROWS,% MISSING
0,building_id,0,20216100,0.0
1,meter,0,20216100,0.0
2,timestamp,0,20216100,0.0
3,meter_reading,0,20216100,0.0
4,site_id,0,20216100,0.0
5,primary_use,0,20216100,0.0
6,square_feet,0,20216100,0.0
7,year_built,12127645,20216100,59.99
8,floor_count,16709167,20216100,82.65
9,air_temperature,96658,20216100,0.48


In [83]:
#export_csv = weather_train.to_csv (r'C:\Desktop\newWeather_train.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path