# Predicting Rain ☔ in Australia 🦘

#### If you like my work, it would be great if you upvoted this notebook!
#### If not, leaving a comment on what I need to work on and improve would be really helpful!

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.simplefilter("ignore")

## Loading up the data

In [None]:
# Read the weather CSV from the relative input directory into a DataFrame.
df = pd.read_csv("../input/weather-dataset-rattle-package/weatherAUS.csv")
# Preview the first rows of the raw data.
df.head()

In [None]:
# (rows, columns) of the raw data, before rows with missing values are dropped.
df.shape

In [None]:
# Count the missing (NaN) values in each column of the dataset.
df.isna().sum()

In [None]:
# Keep only the rows that have no missing value in any column.
df = df.dropna()

In [None]:
# (rows, columns) after the rows with missing values were dropped.
df.shape

In [None]:
# Parse Date as datetime using the YYYY-MM-DD format; errors="coerce" turns
# values that do not parse into NaT instead of raising.
df["Date"] = pd.to_datetime(df["Date"], format = "%Y-%m-%d", errors = "coerce")

In [None]:
# Column dtypes after converting Date to datetime.
df.dtypes

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Having a look at the correlation matrix

# At this point the frame still holds string columns (Location, wind directions,
# RainToday/RainTomorrow) and the datetime Date column. Recent pandas versions
# raise on DataFrame.corr() when non-numeric columns are present, so restrict
# the computation to the numeric columns explicitly.
numeric_corr = df.select_dtypes(include="number").corr()

fig, ax = plt.subplots(figsize=(10,8))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(numeric_corr, annot=True, fmt='.1g', cmap="viridis", cbar=False, ax=ax);

In [None]:
# Encode the two Yes/No rain flags as 1/0 using one shared lookup table.
yes_no_codes = {'Yes': 1, 'No': 0}

for flag_col in ('RainTomorrow', 'RainToday'):
    df[flag_col] = df[flag_col].map(yes_no_codes)

In [None]:
# Column dtypes after mapping RainToday/RainTomorrow to 1/0.
df.dtypes

In [None]:
# Interactive Plotly line chart of the Rainfall column against Date (dark template).
fig = px.line(df,x = 'Date', y = ['Rainfall'], template = 'plotly_dark')
fig.show()

In [None]:
# Interactive Plotly line chart of the Evaporation column against Date (dark template).
fig = px.line(df, x = "Date", y = ["Evaporation"], template = 'plotly_dark')
fig.show()

In [None]:
plt.style.use("classic")
fig, ax = plt.subplots(1, 2, figsize=(15, 8))

# Side-by-side class counts for the two rain flags. The column is passed as
# x= because newer seaborn releases no longer accept the plotted vector as the
# first positional argument (it is interpreted as `data` instead).
sns.countplot(x=df["RainToday"], ax=ax[0], palette="Blues");
sns.countplot(x=df["RainTomorrow"], ax=ax[1], palette="Blues");

In [None]:
plt.style.use("classic")
fig, ax = plt.subplots(3, 1, figsize=(20, 20))

# One stacked panel per wind-direction column. The column is passed as x=
# because newer seaborn releases no longer accept the plotted vector as the
# first positional argument (it is interpreted as `data` instead).
for panel, column in zip(ax, ("WindDir9am", "WindDir3pm", "WindGustDir")):
    sns.countplot(x=df[column], ax=panel)
    panel.set_xlabel(column, fontsize=15)

* At 9 AM, the wind direction is mostly `NORTH`
* At 3 PM, the wind direction is mostly `SOUTH EAST`

In [None]:
plt.style.use("classic")
fig, ax = plt.subplots(2, 1, figsize=(20, 20))

# Frequency of each recorded wind-speed value at 9am and 3pm. The column is
# passed as x= because newer seaborn releases no longer accept the plotted
# vector as the first positional argument (it is interpreted as `data` instead).
for panel, column in zip(ax, ("WindSpeed9am", "WindSpeed3pm")):
    sns.countplot(x=df[column], ax=panel)
    panel.set_xlabel(column, fontsize=15)

* At 9 AM, the most common wind speed is `13.0`
* At 3 PM, the most common wind speed is `17.0`

In [None]:
plt.style.use("classic")
fig, ax = plt.subplots(2, 1, figsize=(20, 20))

# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# replacement for a density histogram with an overlaid KDE curve. kde_kws
# cut=3 lets the curve extend past the data range as distplot's KDE did.
for panel, column in zip(ax, ("Humidity9am", "Humidity3pm")):
    sns.histplot(x=df[column], ax=panel, color="orange", bins=40,
                 kde=True, stat="density", kde_kws=dict(cut=3))
    panel.set_xlabel(column, fontsize=15)

In [None]:
plt.style.use("classic")
fig, ax = plt.subplots(2, 1, figsize=(20, 20))

# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# replacement for a density histogram with an overlaid KDE curve. kde_kws
# cut=3 lets the curve extend past the data range as distplot's KDE did.
for panel, column in zip(ax, ("Pressure9am", "Pressure3pm")):
    sns.histplot(x=df[column], ax=panel, color="r", bins=40,
                 kde=True, stat="density", kde_kws=dict(cut=3))
    panel.set_xlabel(column, fontsize=15)

In [None]:
plt.style.use("classic")
fig, ax = plt.subplots(2, 1, figsize=(20, 20))

# Frequency of each cloud-cover value at 9am and 3pm. The column is passed as
# x= because newer seaborn releases no longer accept the plotted vector as the
# first positional argument (it is interpreted as `data` instead).
for panel, column in zip(ax, ("Cloud9am", "Cloud3pm")):
    sns.countplot(x=df[column], ax=panel)
    panel.set_xlabel(column, fontsize=15)

In [None]:
plt.style.use("classic")
fig, ax = plt.subplots(2, 1, figsize=(20, 20))

# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# replacement for a density histogram with an overlaid KDE curve. kde_kws
# cut=3 lets the curve extend past the data range as distplot's KDE did.
for panel, column in zip(ax, ("Temp9am", "Temp3pm")):
    sns.histplot(x=df[column], ax=panel, color="goldenrod", bins=40,
                 kde=True, stat="density", kde_kws=dict(cut=3))
    panel.set_xlabel(column, fontsize=15)

In [None]:
# Column dtypes before label encoding, to see which columns are still non-numeric.
df.dtypes

## Label Encoding the non-numeric variables

In [None]:
from sklearn import preprocessing

# Integer-encode each categorical column. A single encoder object is refit on
# every column in turn, so after the loop it only holds the classes of the
# last column processed.
le = preprocessing.LabelEncoder()
for cat_col in ('Location', 'WindDir9am', 'WindDir3pm', 'WindGustDir'):
    df[cat_col] = le.fit_transform(df[cat_col])

In [None]:
# Remove the datetime Date column from the frame.
df = df.drop(columns=["Date"])

In [None]:
# Having a look at the data types after label encoding and dropping Date
df.dtypes

## Splitting the data into training and test datasets
Here, we are trying to predict whether it will rain tomorrow in Australia using the given data. Hence, `RainTomorrow` will be the target (y) and the rest of the columns will be the input features (X).

In [None]:
# Feature matrix: every column except the RainTomorrow target.
X = df.drop(columns=["RainTomorrow"])

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# y data: the RainTomorrow column, mapped to 1/0 earlier in the notebook
y = df["RainTomorrow"]
y.head()

In [None]:
# Splitting the data into X train, X test and y train, y test

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Number of rows in the training and test splits.
len(X_train), len(X_test)

## Training the Model

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regressor; note that the target (RainTomorrow) is a 0/1 label here,
# so this model outputs continuous values rather than class predictions.
lr = LinearRegression()

In [None]:
# Fit the linear regressor on the training split.
lr.fit(X_train, y_train)

In [None]:
# A scikit-learn regressor's .score() returns the coefficient of determination
# (R^2) of its predictions, not classification accuracy. Report it under that
# name and without percentage scaling, since R^2 is not a percentage of
# correctly classified rows.
LinearRegressionScore = lr.score(X_test,y_test)
print("R^2 score obtained by Linear Regression model:",LinearRegressionScore)

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# 100-tree random forest regressor with a fixed seed, fitted on the training split.
rf = RandomForestRegressor(n_estimators = 100, random_state = 0)
rf.fit(X_train,y_train)

In [None]:
# A scikit-learn regressor's .score() returns the coefficient of determination
# (R^2) of its predictions, not classification accuracy. Report it under that
# name and without percentage scaling.
RandomForestRegressorScore = rf.score(X_test,y_test)
print("R^2 score obtained by Random Forest Regressor model:",RandomForestRegressorScore)

## KNeighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using the 4 closest training rows.
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)

In [None]:
# Evaluate the KNN model itself on the held-out split. The previous version
# called rf.score(), which reported the random forest regressor's score under
# the KNN label.
KNeighborsClassifierScore = knn.score(X_test,y_test)
print("Accuracy obtained by KNeighbors Classifier model:", KNeighborsClassifierScore*100)

## CatBoost Classifier

In [None]:
from catboost import CatBoostClassifier
# CatBoost classifier limited to 10 boosting iterations.
cat = CatBoostClassifier(iterations=10)
# The trailing semicolon suppresses the notebook's display of the returned value.
cat.fit(X_train, y_train);

In [None]:
# Score the CatBoost model on the held-out split.
# NOTE(review): presumably this is mean accuracy, as the label states; confirm against the CatBoost docs.
CatBoostClassifierScore = cat.score(X_test,y_test)
print("Accuracy obtained by CatBoost Classifier model:",CatBoostClassifierScore*100)

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with default hyper-parameters. random_state is fixed (same
# seed as the random forest above) so that repeated runs of the notebook
# produce the same model and score.
gb = GradientBoostingClassifier(random_state=0)
gb.fit(X_train, y_train)

In [None]:
# Mean accuracy of the gradient boosting classifier on the held-out split, printed as a percentage.
GradientBoostingClassifierScore = gb.score(X_test,y_test)
print("Accuracy obtained by Gradient Boosting Classifier model:",GradientBoostingClassifierScore*100)

## Stochastic Gradient Boosting

In [None]:
# Stochastic variant: each tree is fitted on a random 90% of the rows and each
# split considers a random 70% of the features. Because both draws are random,
# random_state is fixed so the model and its score are reproducible.
sgb = GradientBoostingClassifier(subsample = 0.90, max_features = 0.70, random_state = 0)
sgb.fit(X_train, y_train)

In [None]:
# Mean accuracy of the stochastic gradient boosting classifier on the held-out split, printed as a percentage.
StochasticGradientBoostingScore = sgb.score(X_test,y_test)
print("Accuracy obtained by Stochastic Gradient Boosting model:", StochasticGradientBoostingScore*100)

## Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees draws random split thresholds, so fix random_state to make the
# fitted model and its score reproducible across runs.
etc = ExtraTreesClassifier(random_state=0)
etc.fit(X_train, y_train)

In [None]:
# Evaluate the extra-trees model itself on the held-out split. The previous
# version called sgb.score(), which reported the stochastic gradient boosting
# accuracy under the Extra Trees label.
ExtraTreesClassifierScore = etc.score(X_test,y_test)
print("Accuracy obtained by Extra Trees Classifier model:", ExtraTreesClassifierScore*100)

## XGB Classifier

In [None]:
from xgboost import XGBClassifier

# Tree-based XGBoost classifier: 180 trees of depth up to 5 with learning rate 0.1.
xgb = XGBClassifier(booster = 'gbtree', learning_rate = 0.1, max_depth = 5, n_estimators = 180)
# The trailing semicolon suppresses the notebook's display of the returned value.
xgb.fit(X_train, y_train);

In [None]:
# Evaluate the XGBoost model itself on the held-out split. The previous
# version called sgb.score(), which reported the stochastic gradient boosting
# accuracy under the XGB label.
XGBClassifierScore = xgb.score(X_test,y_test)
print("Accuracy obtained by XGB Classifier model:", XGBClassifierScore*100)

## Comparing performance of the models

In [None]:
# Bar chart of the test-set scores of the fitted classifiers.
# Dedicated names are used for the labels and scores: reusing `x`/`y` here
# would overwrite the target Series `y` defined earlier, so re-running the
# train/test split cell afterwards would split against a plain list.
model_names = ["CatBoost Classifier",
               "Extra Trees Classifier",
               "XGB Classifier",
               "Gradient Boosting Classifier",
               "Stochastic Gradient Boosting"]

model_scores = [CatBoostClassifierScore,
                ExtraTreesClassifierScore,
                XGBClassifierScore,
                GradientBoostingClassifierScore,
                StochasticGradientBoostingScore]

plt.style.use("classic")
fig, ax = plt.subplots(figsize=(8,6))
sns.barplot(x=model_names, y=model_scores, palette="crest", ax=ax);
plt.ylabel("Model Accuracy")
plt.xticks(rotation=40)
plt.title("Model Comparison - Model Accuracy");