In [1]:
import pandas as pd
import numpy as np
 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the training data; the trailing expression displays its (rows, columns) shape.
train = pd.read_csv('../data/train.csv')
train.shape

(6559830, 5)

In [None]:
# Load the weather data; the trailing expression displays its (rows, columns) shape.
weather = pd.read_csv('../data/weather.csv')
weather.shape

(20017278, 5)

In [None]:
# Convert the Timestamp column of both frames to datetime values so the two
# frames share a key dtype when they are merged on Timestamp.
weather['Timestamp'] = pd.to_datetime(weather['Timestamp'])
train['Timestamp'] = pd.to_datetime(train['Timestamp'])

In [None]:
# Right join on (SiteId, Timestamp): every row of `train` is kept and the
# weather columns are attached wherever a matching weather row exists.
# NOTE(review): a train row presumably matches several weather rows (one per
# station, given the Distance column) and is then repeated — confirm.
train_weather = pd.merge(weather, train, how = 'right', on = ['SiteId', 'Timestamp'])

In [None]:
# Calendar features derived from Timestamp via the vectorized .dt accessor.
# pandas Timestamp objects have no `wday` attribute, so the weekday comes
# from .dt.dayofweek (Monday=0 ... Sunday=6).
train_weather['Minutes'] = train_weather['Timestamp'].dt.minute
train_weather['Hour'] = train_weather['Timestamp'].dt.hour
train_weather['day'] = train_weather['Timestamp'].dt.day
train_weather['wday'] = train_weather['Timestamp'].dt.dayofweek
train_weather['month'] = train_weather['Timestamp'].dt.month
train_weather['Year'] = train_weather['Timestamp'].dt.year

In [None]:
# Preview the first rows of the merged training frame.
train_weather.head()

In [None]:
# Load the submission format and parse its timestamps BEFORE merging:
# weather['Timestamp'] is already datetime, and merging a datetime key
# against a string key either raises or matches nothing.
test = pd.read_csv('../data/submission_format.csv')
test['Timestamp'] = pd.to_datetime(test['Timestamp'])

# Right join keeps every submission row and attaches weather where available.
test_weather = pd.merge(weather, test, how='right', on=['SiteId', 'Timestamp'])

In [None]:
# Calendar features derived from Timestamp via the vectorized .dt accessor.
# pandas Timestamp objects have no `wday` attribute, so the weekday comes
# from .dt.dayofweek (Monday=0 ... Sunday=6).
test_weather['Minutes'] = test_weather['Timestamp'].dt.minute
test_weather['Hour'] = test_weather['Timestamp'].dt.hour
test_weather['day'] = test_weather['Timestamp'].dt.day
test_weather['wday'] = test_weather['Timestamp'].dt.dayofweek
test_weather['month'] = test_weather['Timestamp'].dt.month
test_weather['Year'] = test_weather['Timestamp'].dt.year



In [None]:
# Preview the first rows of the merged test frame.
test_weather.head()

# Column showing the time difference between consecutive measurements

In [None]:
# Difference between each row's Timestamp and the previous row's Timestamp.
# NOTE(review): the diff runs over the whole merged frame in its current row
# order, without grouping by SiteId — confirm that is intended.
train_weather['time_diff'] = train_weather['Timestamp'].diff()

Put the time difference in terms of hours.

In [None]:
# Gap to the previous row in hours. Computed directly from Timestamp so the
# cell can be re-run safely (it does not depend on time_diff still holding
# timedeltas) and without a Python-level loop; the first row stays NaN.
train_weather['time_diff'] = train_weather['Timestamp'].diff().dt.total_seconds() / 3600

In [None]:
# `DataFrame.group` does not exist; the grouping call is `groupby`.
# NOTE(review): assumed intent — mean time_diff per SiteId; confirm the
# grouping key and aggregation that were actually wanted.
averages = train_weather.groupby('SiteId')['time_diff'].mean()

In [None]:
# Load the submission frequency table and add an `hours` column holding
# ForecastPeriodNS divided by 3600 * 1e9 (nanoseconds per hour).
sub = pd.read_csv('../data/submission_frequency.csv').assign(
    hours=lambda frame: frame['ForecastPeriodNS'] / (3600 * 1e9)
)

In [None]:
# Show the first 20 rows of `sub`.
sub.head(20)

## Sort first by timestamp and then by weather station distance

In [None]:
# Rows for site 2, ordered by timestamp and then by station distance.
site_two = (
    train_weather
    .loc[train_weather['SiteId'] == 2]
    .sort_values(by=['Timestamp', 'Distance'])
)

In [None]:
# Preview the sorted site 2 rows.
site_two.head()

## Fill NA temperature values

In [None]:
# Replace missing temperatures with -99.
# NOTE(review): -99 looks like a "missing" sentinel — confirm it cannot
# collide with a real temperature reading.
site_two['Temperature'] = site_two['Temperature'].fillna(-99)

## Drop Duplicates

In [None]:
# Keep only the first row for each Timestamp (rows are already sorted by
# Timestamp and then Distance in the cell that builds site_two).
site_two = site_two.drop_duplicates(subset='Timestamp', keep='first')

In [None]:
# Preview site 2 after duplicate timestamps were removed.
site_two.head()

## Fill NA in value column

In [None]:
# Forward-fill missing values from the previous row. Series.ffill() replaces
# the deprecated fillna(method='ffill') spelling and gives the same result.
site_two['Value'] = site_two['Value'].ffill()

## Timestamp to elapsed seconds since the first observation (whole days only, for now)

In [None]:
# Seconds elapsed since the site's earliest timestamp, computed without a
# Python-level loop.
# NOTE(review): .dt.days keeps only the whole-day component, so every row of
# the same day gets the same value — confirm total_seconds() was not intended.
site_two['Timestamp'] = (site_two['Timestamp'] - site_two['Timestamp'].min()).dt.days * 3600 * 24

# Drop Unused columns

In [None]:
# Remove the columns listed as unused; everything else is kept.
site_two = site_two.drop(
    labels=['Unnamed: 0', 'Distance', 'SiteId', 'ForecastId', 'Value'],
    axis='columns',
)

# Processing Function

In [None]:
def process_site(site_id, data='train', train_df=None, test_df=None):
    """Build the feature frame and the label series for a single site.

    Parameters
    ----------
    site_id : scalar
        Value matched against the ``SiteId`` column.
    data : str, default 'train'
        ``'train'`` takes the rows from the training frame; any other value
        takes them from the test frame.
    train_df : pandas.DataFrame, optional
        Merged training frame. Defaults to the notebook-level
        ``train_weather``. It is consulted for test data too, because the
        time origin is the site's earliest training timestamp.
    test_df : pandas.DataFrame, optional
        Merged test frame. Defaults to the notebook-level ``test_weather``
        and is only read when ``data`` is not ``'train'``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The features (one row per distinct timestamp, missing values
        replaced by -99) and the labels (``Value`` forward-filled, remaining
        gaps set to 0).

    Raises
    ------
    ValueError
        If the site has no rows in the training frame, so no time origin
        can be derived.
    """
    if train_df is None:
        train_df = train_weather
    site_train = train_df[train_df['SiteId'] == site_id]

    if data == 'train':
        site_rows = site_train
    else:
        if test_df is None:
            test_df = test_weather
        site_rows = test_df[test_df['SiteId'] == site_id]

    if site_train.empty:
        raise ValueError(
            'site {!r} has no training rows; cannot derive a time origin'.format(site_id)
        )
    min_date = site_train['Timestamp'].min()

    # Sorting by Distance within a timestamp means the de-duplication below
    # keeps the row with the smallest Distance for each timestamp.
    df = site_rows.sort_values(['Timestamp', 'Distance'])
    df['Temperature'] = df['Temperature'].fillna(-99)
    df = df.drop_duplicates('Timestamp', keep='first')

    # ffill() replaces the deprecated fillna(method='ffill'); leading gaps
    # that have nothing to copy from become 0.
    labels = df['Value'].ffill().fillna(0)

    df = df.drop(columns=['Unnamed: 0', 'Distance', 'SiteId', 'ForecastId', 'Value'])

    # Whole days since the site's first training timestamp, in seconds.
    # NOTE(review): .dt.days discards the intra-day part — confirm that
    # total_seconds() was not intended.
    df['Timestamp'] = (df['Timestamp'] - min_date).dt.days * 3600 * 24
    df = df.fillna(-99)

    return df, labels

In [None]:
# Build features and labels for site 1 from the training data, then preview
# the feature frame.
site_one_df, labels = process_site(1)
site_one_df.head()

# Linear Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a linear regression on the site 1 features and labels.
lin_model = LinearRegression()
lin_model.fit(site_one_df, labels)

# Submission Data

In [None]:
# Build the site 1 features from the test data.
# NOTE(review): test_labels is derived from the test frame's Value column,
# presumably placeholder values from the submission file — confirm.
test_weather_df, test_labels = process_site(1, data='test')

In [None]:
# Preview the site 1 test features.
test_weather_df.head()

## Make Predictions

In [None]:
from itertools import chain
from IPython.display import clear_output

In [None]:
# Predict on the site 1 test features with the currently fitted linear model.
predictions = lin_model.predict(test_weather_df)

In [None]:
# Number of predictions produced.
len(predictions)

In [None]:
import sys
import time

In [None]:
# Fit one linear model per site and collect each site's test predictions.
predictions = []

# Series.unique() keeps first-appearance order, unlike set(), whose iteration
# order is unspecified. The per-site arrays are later concatenated and matched
# to submission rows by position.
# NOTE(review): this assumes the submission rows are grouped by SiteId and
# ordered by timestamp within each site — confirm.
for site_id in test['SiteId'].unique():  # `site_id` avoids shadowing builtin id()
    print(site_id)

    lin_model = LinearRegression()
    train_x, train_y = process_site(site_id, data='train')
    test_x, test_y = process_site(site_id, data='test')

    lin_model.fit(train_x, train_y)
    predicted = lin_model.predict(test_x)
    predictions.append(predicted)

    # Wipe the progress line so the cell output does not grow with every site.
    clear_output()

In [None]:
# Read the submission format file into its own frame.
test_submit = pd.read_csv('../data/submission_format.csv')

In [None]:
# Concatenate the per-site prediction arrays into one flat list.
predictions_flat = list(chain.from_iterable(predictions))

# Random Forest

In [None]:
from IPython.display import clear_output
import sys

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit one random forest per site and collect each site's test predictions.
rf_predictions = []

# Series.unique() keeps first-appearance order, unlike set(), whose iteration
# order is unspecified. The per-site arrays are later concatenated and matched
# to submission rows by position.
# NOTE(review): this assumes the submission rows are grouped by SiteId and
# ordered by timestamp within each site — confirm.
site_ids = test['SiteId'].unique()
n_sites = len(site_ids)

for i, site_id in enumerate(site_ids):  # `site_id` avoids shadowing builtin id()

    print('Percentage Complete: {:.2f}'.format(100 * i / n_sites))
    # Fixed seed so a re-run reproduces the same forests.
    tree_reg = RandomForestRegressor(n_estimators=500, random_state=42)
    train_x, train_y = process_site(site_id, data='train')
    test_x, test_y = process_site(site_id, data='test')

    tree_reg.fit(train_x, train_y)
    # Predict with the forest that was just fitted for this site, not with
    # the linear model left over from the earlier section.
    predicted = tree_reg.predict(test_x)
    clear_output()
    rf_predictions.append(predicted)

In [None]:
# Flatten the per-site random-forest arrays (not the linear-model
# `predictions`) into one list and attach it to a fresh copy of the
# submission format by position.
rf_predictions = list(chain.from_iterable(rf_predictions))
test_submit = pd.read_csv('../data/submission_format.csv')
test_submit['Value'] = rf_predictions

In [None]:
# Write the submission frame to CSV without the index column.
test_submit.to_csv('../submissions/second_rf.csv', index=False)

# Feature Engineering