In [187]:
#Imports
#%%capture
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    #DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Applied-Modeling/master/data/'
    !pip install category_encoders==2.*
    !pip install eli5
    !pip install pdpbox
    !pip install shap

# If you're working locally:
else:
    DATA_PATH = '../data/'

import datetime
import pandas as pd
import numpy as np

from category_encoders import OrdinalEncoder, OneHotEncoder

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
#Yes today (Regression)
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
#Not today (Classification)
#Tomorrow, both
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.inspection import permutation_importance
#import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

import matplotlib.pyplot as plt

In [4]:
#Raw meterorological data
met_readings = pd.read_csv('BW_MET46251.txt', sep=' ', skipinitialspace=True)

#Basically government-made features made out of spectral data
directions = pd.read_csv('BW_SPEC46251.spec.txt', sep=' ', skipinitialspace=True)


def timeFixer(df):
    #first row is weird
    row1 = df.index[0]
    #drop weird first row
    df.drop(df.index[0],inplace=True)
    #add 00 for seconds
    df['ss'] = '00'
    #create date column
    df['Date'] = df['#YY']+'/'+df['MM']+'/'+df['DD']+' '+df['hh']+':'+df['mm']+':'+df['ss']
    #Convert df['Date'] to DateTime object
    #df['Date'] = datetime.datetime.strptime(df['Date'], '%Y/%m/%d %H:%M:%s.%f')
    df['Date'] = pd.to_datetime(df['Date'], format='%Y/%m/%d %H:%M:%S')
    #Index on date
    df.index = df['Date']
    #Drop now uneeded date & time columns
    df.drop(columns=['#YY','MM','DD','hh','mm','ss','Date'],inplace=True)
    #If more than 11 columns, drop extras
    extras = ['WDIR','WSPD','GST','PRES','ATMP','DEWP','VIS','PTDY','TIDE','MWD','WVHT','APD']
    if len(df.columns) > 11:
        df.drop(columns=extras, inplace=True)
    
    return df

met = timeFixer(met_readings)
directions = timeFixer(directions)

new_test1 = pd.read_csv('Test46251.txt', sep=' ', skipinitialspace=True)
new_test2 = pd.read_csv('test2.spec.txt', sep=' ', skipinitialspace=True)
nt1 = timeFixer(new_test1)
nt2 = timeFixer(new_test2)

display(met.head(1))
display(directions.head(1))

combined = pd.concat([met,directions],axis=1)
new_test = pd.concat([nt1,nt2],axis=1)

display(combined.head(2))
print('Combined shape:',combined.shape)
combined = combined.dropna()
new_test = new_test.dropna()
print('Combined shape minus rows with NaNs',combined.shape)

#Convert numericals to floats
numerical_cols = ['WVHT','DPD','WTMP','SwH','SwP','WWH','WWP','APD','MWD']
categorical_cols = ['SwD','WWD','STEEPNESS']
combined[numerical_cols] = combined[numerical_cols].astype('float')
new_test[numerical_cols] = new_test[numerical_cols].astype('float')

# #OHE steepness column (can just ohe)
# keywords = ['AVERAGE','SWELL','STEEP','VERY_STEEP']
# for col in keywords:
#     combined[col] = (combined['STEEPNESS']
#                     .fillna('')
#                     .str.lower()
#                     .str.contains(col, regex=True)
#                     .astype(int))

combined_comp = combined['STEEPNESS'].value_counts(normalize=True)
most_class = combined['STEEPNESS'].value_counts(normalize=True).max()
print('---')
print('Relative Frequency:',combined_comp)
print('---')
print('Proportion of most common class:',(most_class*100))
#The most common class is 'average' at 58%

#There are 45 days so 27/9/9 split
cutoff_train = '2021-04-01 00:00:00'
cutoff_val = '2021-04-10 00:00:00'
cutoff_test = '2021-04-20 00:00:00'
cutoff_new_test = '2021-4-20 00:00:00'

train = combined.loc[combined.index < cutoff_train]
combined = combined.loc[combined.index > cutoff_train]

val = combined.loc[combined.index < cutoff_val]
combined = combined.loc[combined.index > cutoff_val]

test = combined.loc[combined.index < cutoff_test]

test_new = new_test.loc[new_test.index>cutoff_test]

train_range = [train.index[1],train.index[-1]]
val_range = [val.index[1],val.index[-1]]
test_range = [test.index[1],test.index[-1]]
print('---')
print('Train:',train_range,'Length:',len(train))
print('Val:',val_range,'Length:',len(val))
print('Test:',test_range,'Length:',len(test))
print(len(test_new))

target = 'MWD'
X_train = train.drop(columns=target)
y_train = train[target]

X_val = val.drop(columns=target)
y_val = val[target]

X_test = test.drop(columns=target)
y_test = test[target]

X_new_test = new_test.drop(columns=target)
y_new_test = new_test[target]

print('---')
print('Train:',X_train.shape,y_train.shape,'Val:',X_val.shape,y_val.shape,'Test:',X_test.shape,y_test.shape)

print('---')
baseline_acc = y_train.mean()
y_pred = [y_train.mean()] * len(y_train)
print('Mean Wave Direction:',round(baseline_acc,2))
print('Mean Absolute Error of Naive Regressor:',mean_absolute_error(y_train,y_pred))



Unnamed: 0_level_0,DPD,WTMP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-04-19 20:26:00,14,16.1


Unnamed: 0_level_0,WVHT,SwH,SwP,WWH,WWP,SwD,WWD,STEEPNESS,APD,MWD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-04-19 20:26:00,1.3,0.7,14.3,1.1,6.7,SSW,W,AVERAGE,5.6,198


Unnamed: 0_level_0,DPD,WTMP,WVHT,SwH,SwP,WWH,WWP,SwD,WWD,STEEPNESS,APD,MWD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-03-05 00:26:00,10,14.9,1.1,0.6,12.5,0.9,9.9,W,W,AVERAGE,7.7,276
2021-03-05 00:56:00,11,14.4,1.1,0.6,10.5,0.9,9.9,W,W,AVERAGE,7.5,264


Combined shape: (1786, 12)
Combined shape minus rows with NaNs (1744, 12)
---
Relative Frequency: AVERAGE       0.583142
VERY_STEEP    0.198968
SWELL         0.112385
STEEP         0.105505
Name: STEEPNESS, dtype: float64
---
Proportion of most common class: 58.31422018348624
---
Train: [Timestamp('2021-03-05 00:56:00'), Timestamp('2021-03-31 23:56:00')] Length: 856
Val: [Timestamp('2021-04-01 00:56:00'), Timestamp('2021-04-09 23:56:00')] Length: 419
Test: [Timestamp('2021-04-10 00:56:00'), Timestamp('2021-04-19 20:26:00')] Length: 469
158
---
Train: (856, 11) (856,) Val: (419, 11) (419,) Test: (469, 11) (469,)
---
Mean Wave Direction: 255.24
Mean Absolute Error of Naive Regressor: 26.141649925757708


In [5]:
#Adaptive Boosting Regressor
model_abr = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    AdaBoostRegressor(random_state=42)
)
model_abr.fit(X_train,y_train)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['SwD', 'WWD', 'STEEPNESS'],
                               use_cat_names=True)),
                ('adaboostregressor', AdaBoostRegressor(random_state=42))])

In [6]:
def check_metrics(model):
    print('---')
    print(model)
    print('Training MAE:', mean_absolute_error(y_train, model.predict(X_train)))
    print('Validation MAE:', mean_absolute_error(y_val, model.predict(X_val)))
    print('Validation R^2:', model.score(X_val,y_val))
    print()
    print()

    
models = [model_abr]

for m in models:
    check_metrics(m)

---
Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['SwD', 'WWD', 'STEEPNESS'],
                               use_cat_names=True)),
                ('adaboostregressor', AdaBoostRegressor(random_state=42))])
Training MAE: 4.937723735251001
Validation MAE: 5.8945584794076025
Validation R^2: 0.9549167976634525




In [7]:
#Real Prediction
y_pred = model_abr.predict(X_new_test)
test_mae = mean_absolute_error(y_new_test,model_abr.predict(X_new_test))
test_R2 = model_abr.score(X_new_test,y_new_test)

print(test_mae,test_R2)

5.288878571039399 0.9614769957410216


In [None]:
##Below is tidal stuff

In [176]:
#Working with tides
tides_dates_times = pd.read_csv('BW_MET46251.txt', sep=' ', skipinitialspace=True)
tides = pd.read_csv('9413745.txt',delim_whitespace=True)

tides_dates_times['Date'] = tides_dates_times['#YY']+'/'+tides_dates_times['MM']+'/'+tides_dates_times['DD']+' '+tides_dates_times['hh']+':'+tides_dates_times['mm']

#Now I need to convert both time formats into common datetime format expected by prophet, which I'm 
#Hoping to use to infill tidal values, though there is likely a fundamentally more sound way
#To accomplish this.  Tides need to convert to from 24 hour time to datetime.
#from datetime import datetime

tides['Time'] = tides['Day']+' '+tides['Time']
for i in tides.Time:
    tides['a'] = tides['Time'].str[:2].astype('int')
    tides['b'] = tides['Time'].str[3:5].astype('int')
    tides['c'] = tides['Time'].str[-2:]
    tides['d'] = tides['Time']
drop = ['Day','Time']
tides.drop(columns=drop,inplace=True)

a = tides.loc[tides['c']=='PM']
d = a.copy()
d['a'] = d['a']+12
b = tides.loc[tides['c']=='AM']
c = pd.concat([b,d])
#c['dt'] = c.index+' '+c['a'].astype('str').str[0:4]+':'+c['b'].astype('str')+':'+'00'
c['e'] = c.index+' '+c['d']
c['dt'] = pd.to_datetime(c['e'],infer_datetime_format=True)
c.index = c['dt']
drop=['a','b','c','d','dt','e']
c.drop(columns=drop,inplace=True)
c = c.sort_index()
c['time'] = c.index

c['time2'] = c['time'].shift(1)
c['time3'] = (c['time']-c['time2']).astype('timedelta64[h]')
c['Pred2'] = c['Pred'].shift(1)
c['per_dif'] = c['Pred']-c['Pred2']
c['del'] = c['per_dif'] / c['time3']



In [177]:
#To impute tidal data
t = []    
pp = []
p = []
for j in range(1,len(c)):    #For length of tide dataframe
    p = []
    for i in range(1,((c['time3'][j]).astype('int')-1)): #For the number of hours between observations j & j+1
        interval = datetime.timedelta(hours=i) #Add iterating hours
        t.append(c['time2'][j]+interval) #add to list of 6 datetimes
        p.append(abs(c['Pred'][j]+(c['del'][j]-(i)))) #Add prediction (prev pred + i * rate)
    p.reverse()
    pp.extend(p)
data = {'time':t,'Pred':pp}
mp = pd.DataFrame(data)
mp.index = mp['time']
mp.head(10)
cmp = pd.concat([mp,c])

#cmp.sort_index().head(5)

cmp_drop = ['Date','time2','time3','Pred2','per_dif','del']
cmp.drop(columns=cmp_drop,inplace=True)
cmp['day'] = cmp.index.day
cmp['time'] = cmp.index.hour
cmp = cmp.sort_values(['day','time'],ascending=[True,True])
cmp = cmp.reset_index()
cmp

Unnamed: 0,index,time,Pred,High/Low,day
0,2021-03-01 05:27:00,5,1.030000,L,1
1,2021-03-01 06:27:00,6,1.451667,,1
2,2021-03-01 07:27:00,7,2.451667,,1
3,2021-03-01 08:27:00,8,3.451667,,1
4,2021-03-01 09:27:00,9,4.451667,,1
...,...,...,...,...,...
564,2021-03-31 13:38:00,13,3.850000,H,31
565,2021-03-31 14:38:00,14,1.946000,,31
566,2021-03-31 15:38:00,15,0.946000,,31
567,2021-03-31 16:38:00,16,0.054000,,31


In [178]:
c_drop = ['time']
c.drop(columns=c_drop,inplace=True)
c['day'] = c.index.day
c['time'] = c.index.hour
c = c.reset_index()
c
ccmp = pd.concat([cmp,c])
ccmp = ccmp.sort_values(['day','time'],ascending=[True,True])
ccmp.reset_index()

ccmp['key'] = (ccmp['time']).astype('str') + (ccmp['day']).astype('str')
ccmp.drop(columns=['High/Low','dt','Date','time2','time3','Pred2','per_dif','del'],inplace=True)
ccmp.dropna(inplace=True)
ccmp

In [186]:
#I've imputed my tidal data, now I'll combine it into my df
#I'll just make a copy of the old one
X_train_tides = X_train.copy()
y_train_tides = y_train.copy()

X_train_tides['time'] = X_train_tides.index.hour
X_train_tides['day'] = X_train_tides.index.day
X_train_tides['key'] = (X_train_tides['time']).astype('str') + (X_train_tides['day']).astype('str')

X_train_tides_c = pd.merge(X_train_tides,ccmp,on='key',how='inner')
X_train_tides_c.drop_duplicates(inplace=True)
X_train_tides_c.index = X_train_tides_c['index']
X_train_tides_c.drop(columns=['time_x','day_x','key','index','time_y','day_y'],inplace=True)
X_train_tides_c

Unnamed: 0_level_0,DPD,WTMP,WVHT,SwH,SwP,WWH,WWP,SwD,WWD,STEEPNESS,APD,Pred
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-03-05 02:32:00,10.0,13.8,1.1,0.6,10.5,0.9,9.9,W,W,AVERAGE,7.3,5.170000
2021-03-05 02:32:00,11.0,13.8,1.1,0.6,10.5,0.9,7.7,W,WSW,AVERAGE,7.1,5.170000
2021-03-05 03:32:00,11.0,13.8,1.0,0.5,11.1,0.9,9.1,WSW,W,AVERAGE,6.6,5.612857
2021-03-05 03:32:00,11.0,13.8,1.0,0.6,11.1,0.8,9.9,W,W,AVERAGE,6.4,5.612857
2021-03-05 04:32:00,13.0,13.8,1.0,0.6,12.5,0.9,9.9,WSW,W,AVERAGE,6.5,4.612857
...,...,...,...,...,...,...,...,...,...,...,...,...
2021-03-31 15:38:00,13.0,13.2,0.9,0.6,12.5,0.7,8.3,W,W,AVERAGE,7.3,0.946000
2021-03-31 16:38:00,15.0,13.3,0.9,0.6,15.4,0.7,9.1,SSW,W,AVERAGE,7.6,0.054000
2021-03-31 16:38:00,15.0,13.4,0.8,0.5,15.4,0.6,9.1,SSW,WNW,AVERAGE,7.4,0.054000
2021-03-31 18:49:00,18.0,13.8,0.9,0.6,18.2,0.6,7.1,SSW,W,SWELL,8.5,1.520000


In [None]:
#Now that I've finally fitted in my tidal data, I don't really want to do the same thing
#For the validation and test sets.  I could probably do them together (data leak?)
#What I should do is make an advanced wrangle function that does all the steps automatically
#It woudln't even take that long

#For now, I'm going to focus on some visualizations to satisfy the project constraints
