# Getting Started

In [200]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

from sklearn.metrics import f1_score

In [201]:
# Load the weatherAUS CSV into a DataFrame.
weather_csv_path = "../input/weather-dataset-rattle-package/weatherAUS.csv"
data = pd.read_csv(weather_csv_path)

In [202]:
# Display the freshly loaded DataFrame as the cell output.
data

In [203]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [204]:
# Remove the Date column in place; it is not used as a model feature.
data.drop(columns=['Date'], inplace=True)

# Data Preprocessing

In [205]:
# Count the missing values in each column of the dataset.
data.isna().sum()

In [206]:
# Show the data type of each column.
data.dtypes

In [207]:
# Inspect the distinct values of RainToday to see whether missing values appear.
data['RainToday'].unique()

In [208]:
# Replace missing RainToday / RainTomorrow entries with 'No' so that both
# columns are fully populated before encoding.
# NOTE(review): RainTomorrow is the prediction target; filling it with 'No'
# invents labels. Consider dropping those rows instead — confirm intent.
for rain_column in ('RainToday', 'RainTomorrow'):
    data[rain_column] = data[rain_column].fillna('No')

In [209]:
# Encode the RainToday and RainTomorrow labels as integers with LabelEncoder.
label_encoder_columns = ['RainToday', 'RainTomorrow']

encoder = LabelEncoder()

for label_column in label_encoder_columns:
    # Fit on the column, then replace it with its integer codes.
    encoder.fit(data[label_column])
    data[label_column] = encoder.transform(data[label_column])

In [210]:
# Add a prefix to every value of a column.
def add_column_prefixes(data, column, prefix):
    """Return ``data[column]`` with ``prefix`` prepended to each value.

    Every value is converted with ``str`` before the prefix is added. The
    input DataFrame is not modified; a new Series is returned.
    """
    def _with_prefix(value):
        return prefix + str(value)

    return data[column].map(_with_prefix)

In [211]:
# Prefix the 9am and 3pm wind-direction labels so the two columns no longer
# share the same label values.
for wind_column, hour_prefix in (('WindDir9am', "9_"), ('WindDir3pm', "3_")):
    data[wind_column] = add_column_prefixes(data, wind_column, hour_prefix)

In [212]:
# Display the dataset to check the prefixed wind-direction columns.
data 

In [213]:
# Preview get_dummies on WindGustDir: each unique value gets its own column.
# The result is only displayed here, not stored.
pd.get_dummies(data['WindGustDir'])

In [214]:
# Use one-hot encoding to convert categorical columns to indicator columns.
def onehot_encoder(data, columns):
    """Return a frame where each listed column is replaced by its dummies.

    For every name in ``columns`` the indicator columns produced by
    ``pd.get_dummies`` are appended on the right and the source column is
    dropped. The caller's DataFrame is left untouched.
    """
    encoded = data
    for name in columns:
        indicator_frame = pd.get_dummies(encoded[name])
        encoded = pd.concat([encoded, indicator_frame], axis=1).drop(columns=name)
    return encoded

In [215]:
# Apply one-hot encoding to Location, WindGustDir, WindDir9am and WindDir3pm.
categorical_features = [
    'Location',
    'WindGustDir',
    'WindDir9am',
    'WindDir3pm',
]

data = onehot_encoder(data=data, columns=categorical_features)

In [216]:
# Check whether any missing values remain after encoding.
data.isna().sum()

In [217]:
# Missing values remain, so fill them with the mean of their column.
def impute_means(data, columns):
    """Fill missing values in the given columns with each column's mean.

    ``data`` is modified in place and nothing is returned.
    """
    for name in columns:
        series = data[name]
        data[name] = series.fillna(series.mean())

In [218]:
# Missing-value count per column before mean imputation is applied.
data.isna().sum()

In [219]:
# Columns whose missing values are replaced by the column mean.
na_columns = [
    'MinTemp', 'MaxTemp',
    'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm',
    'Temp9am', 'Temp3pm',
]

impute_means(data=data, columns=na_columns)

In [220]:
# Check once more whether any missing values remain.
data.isna().sum()

# The data is now fully prepared for training and testing the model

In [221]:
# Split the dataset into the feature matrix X and the target vector y.
X = data.drop(columns='RainTomorrow')
y = data['RainTomorrow']

In [222]:
# Display the target vector.
y

In [223]:
# Display the feature matrix.
X

In [224]:
# Scale the features with RobustScaler, which centers each feature on its
# median and scales by its interquartile range, so extreme values influence
# the scaling less than with mean/std scaling. It does not remove outliers.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the scaling — consider fitting on the
# training split only.
scaler = RobustScaler()

# Keep the original index so the scaled X stays row-aligned with y.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [225]:
# Split into training (80%) and test (20%) sets. A fixed seed makes the split
# reproducible, and stratifying on y keeps the class ratio of the target the
# same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42, stratify=y
)

In [230]:
# Build the network: two hidden ReLU layers of 16 units and a 2-unit softmax
# output (one unit per class). The input width is taken from the training
# data rather than hard-coded, so it follows the number of feature columns
# produced by the one-hot encoding.
inputs = tf.keras.Input(shape=(X_train.shape[1],))
x = tf.keras.layers.Dense(16, activation='relu')(inputs)
x = tf.keras.layers.Dense(16, activation='relu')(x)
outputs = tf.keras.layers.Dense(2, activation='softmax')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Compile the model. Sparse categorical cross-entropy matches integer class
# labels combined with a softmax output.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Set the batch size and the number of epochs.
batch_size = 32
epochs = 6

# Train, holding out 20% of the training data for validation.
# NOTE(review): ReduceLROnPlateau is used with its default settings; with
# only 6 epochs it may never trigger — confirm against its default patience.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[tf.keras.callbacks.ReduceLROnPlateau()],
    verbose=1
)

In [227]:
# Plot training loss (blue) and validation loss (red) per epoch.
plt.figure(figsize=(14, 10))

train_loss = history.history['loss']
val_loss = history.history['val_loss']
# Take the x-axis from the recorded history instead of a separate constant,
# so it always matches the number of epochs that were actually run.
epoch_index = range(len(train_loss))

plt.plot(epoch_index, train_loss, color='b', label='Training loss')
plt.plot(epoch_index, val_loss, color='r', label='Validation loss')

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [231]:
# Zero-based index of the epoch with the lowest validation loss.
np.asarray(history.history['val_loss']).argmin()

# Results of this project

In [232]:
# Evaluate on the held-out test set; element 1 of the result is the value
# printed as the model accuracy.
test_metrics = model.evaluate(X_test, y_test, verbose=0)
print(f"Model Accuracy: {test_metrics[1]}")