In [11]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, Normalization
import pandas as pd
from datetime import date
import xgboost as xgb
import numpy as np

# Plan is to predict whether or not it was a rain day based on the temperature, cloud coverage and humidity. 

df = pd.read_csv('isRainDay3.csv')
df = df.drop(columns=[" dow"," ObsCode"," dcnn"," hol"," day", "year"]) # Drop unnecessary columns
df = df.rename(columns={"N8": "Cloud Cover", "RH": "Relative Humidity", "rd": "Rain Day",
                        "sss": "Hours of Sun", "Pstn": "Pressure"}, errors="raise")
df = df.loc[(df["Relative Humidity"] != "x") & (df["Cloud Cover"] != "OBS") & (df["Cloud Cover"] != "x")
           & (df["Pressure"] != "x") & (df["Hours of Sun"] != "x")]
df = df.astype({"Cloud Cover":"int64", "Relative Humidity":"int64", "Tdry":"float",
                "Pressure":"float", "Rain Day": "int64", "Hours of Sun": "float"})
# ^Data Sanitisation^
print(df)

isRainDay = df[["Rain Day"]]
trainingData = df
trainingData.pop("Rain Day") # Creates list of data we can use for training that does not say whether or not it is a rain day
tensorTraining = tf.convert_to_tensor(trainingData)

        month  Pressure  Tdry  Relative Humidity  Cloud Cover  Rain Day  \
18628       1    1012.3   5.2                 89            2         1   
18629       1     994.0   5.1                 80            5         1   
18630       1    1008.2   2.1                 96            3         1   
18631       1    1006.4   0.3                 94            0         0   
18632       1    1014.8   0.4                 95            8         1   
...       ...       ...   ...                ...          ...       ...   
42077       3    1011.9   3.0                 91            6         1   
42078       3    1001.7  10.0                 86            8         0   
42079       3     998.7  11.0                 91            8         1   
42080       3    1000.0  10.0                 93            8         1   
42081       3    1010.3   7.5                 72            4         0   

       Hours of Sun  
18628           1.0  
18629           5.1  
18630           4.5  
18631      

In [2]:
normalizer = Normalization(axis=-1)
normalizer.adapt(tensorTraining)
normalizer(tensorTraining[0:])

<tf.Tensor: shape=(21839, 6), dtype=float32, numpy=
array([[-1.5969832 ,  0.11630779, -0.8705471 ,  0.6498048 , -1.3959647 ,
        -0.81615394],
       [-1.5969832 , -1.5530978 , -0.8873595 , -0.06319419, -0.28252012,
         0.21601918],
       [-1.5969832 , -0.25771007, -1.3917321 ,  1.2043595 , -1.0248165 ,
         0.06496948],
       ...,
       [-1.0180241 , -1.124342  ,  0.10457302,  0.808249  ,  0.8309245 ,
        -0.23712999],
       [-1.0180241 , -1.0057514 , -0.06355114,  0.9666932 ,  0.8309245 ,
        -0.5895793 ],
       [-1.0180241 , -0.06614104, -0.48386154, -0.69697106, -0.65366834,
        -0.7406291 ]], dtype=float32)>

In [24]:
def getModel(): # Fairly heavily modified code from https://www.tensorflow.org/tutorials/load_data/pandas_dataframe
    model = keras.Sequential([normalizer, Dense(10, activation="relu"), Dense(1)])
    
    model.compile(optimizer="rmsprop", loss="Hinge", metrics=["accuracy"])
    return model

startTime = time.time()
model = getModel()
model.fit(tensorTraining, isRainDay, epochs=15, batch_size=4000)
predicts = model.predict(tensorTraining)
predicts
endTime = time.time()
print(f"Time Taken:{endTime-startTime}")

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Time Taken:1.5222244262695312


In [22]:
import time

xgtrain = xgb.DMatrix(df, label=isRainDay)
xgtest = xgb.DMatrix(df, label=isRainDay)
evallist = [(xgtrain, "train"), (xgtest, "eval")]

rounds = 15
parameters = {"max_depth": 25, "objective": "binary:hinge", "subsample": 0.25}

startTime = time.time()
training = xgb.train(parameters, xgtrain, rounds, evallist)
endTime = time.time()
print(f"Time Taken:{endTime-startTime}")

[0]	train-error:0.56546	eval-error:0.56546
[1]	train-error:0.32181	eval-error:0.32181
[2]	train-error:0.28463	eval-error:0.28463
[3]	train-error:0.23696	eval-error:0.23696
[4]	train-error:0.21594	eval-error:0.21594
[5]	train-error:0.19790	eval-error:0.19790
[6]	train-error:0.18421	eval-error:0.18421
[7]	train-error:0.17519	eval-error:0.17519
[8]	train-error:0.16576	eval-error:0.16576
[9]	train-error:0.16054	eval-error:0.16054
[10]	train-error:0.15170	eval-error:0.15170
[11]	train-error:0.14625	eval-error:0.14625
[12]	train-error:0.14057	eval-error:0.14057
[13]	train-error:0.13499	eval-error:0.13499
[14]	train-error:0.12986	eval-error:0.12986
Time Taken:0.37100839614868164


In [5]:
# Doesn't work, keeping around just in case i think of a use for it
df.insert(0, "Rain Day", isRainDay)
trainingSet = df.sample(frac = 0.75)
testingSet = pd.merge(df, trainingSet, indicator=True, how="outer")\
            .query("_merge=='left_only'").drop("_merge", axis=1)
print(trainingSet)

       Rain Day   month  Pressure  Tdry  Relative Humidity  Cloud Cover  \
34159         1       7    1002.2  17.1                 79            6   
29366         1       5    1005.0  12.8                 91            8   
19602         0       9    1022.6  16.3                 87            8   
23217         0       7    1011.6  18.1                 79            7   
28832         1      12    1009.6   5.4                 75            1   
...         ...     ...       ...   ...                ...          ...   
32008         0       8    1018.1  23.4                 50            2   
35290         0       8    1014.4  17.6                 86            8   
37049         1       6    1002.2  10.8                 80            8   
34866         0       6    1007.2  21.0                 66            8   
40330         0       6    1014.0  17.5                 80            7   

       Hours of Sun  
34159           3.4  
29366           4.1  
19602           7.9  
23217      