<a href="https://colab.research.google.com/github/SSenitha/Data_Analytics/blob/Sandaru's/masterNotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Analysis of Rain in Australia
This notebook contains all the steps of the analysis: From data cleaning and preprocessing to the conclusions.

In [40]:
# Load data from the repository
!wget https://raw.githubusercontent.com/SSenitha/Data_Analytics/main/weatherAUS.csv

--2025-07-14 13:15:15--  https://raw.githubusercontent.com/SSenitha/Data_Analytics/main/weatherAUS.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14094055 (13M) [text/plain]
Saving to: ‘weatherAUS.csv.3’


2025-07-14 13:15:15 (214 MB/s) - ‘weatherAUS.csv.3’ saved [14094055/14094055]



In [41]:
#Import necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import stats

#Data Cleaning

In [52]:
# Read the dataset and output the total count
df = pd.read_csv('/content/weatherAUS.csv')
print(f"Total rows count: {df['RainToday'].count()}")
df.head()

Total rows count: 142199


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


##1. De-duplication

In [43]:
# Check for duplicate rows and removing, keeping the first occurrence
print(df.duplicated().sum())
df_cleaned = df.drop_duplicates()

0


##2. Null Value Handeling

In [44]:
# Check for the missing values
missing_values_count = df.isnull().sum()
print(missing_values_count)

# Calculate the percent of data that is missing
total_cells = np.prod(df.shape)
total_missing = missing_values_count.sum()

percent_missing = (total_missing/total_cells) * 100
print(f"Percentage of missing values: {percent_missing}")

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64
Percentage of missing values: 10.259745694319072


In [45]:
df_cleaned = df

# Remove the rows that has missing result
df_cleaned.dropna(subset=['RainToday'], inplace=True)
df_cleaned.dropna(subset=['RainTomorrow'], inplace=True)
#print(df.head())

# Check the remaining value count
print(f"Total rows Left: {df['RainToday'].count()}")

Total rows Left: 140787


In [46]:
numerical_cols = df.select_dtypes(include=['float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())

for col in categorical_cols:
    df[col] = df[col].fillna('')

##3. Standardization

In [47]:
# convert the date column into a datetime object
df_cleaned['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")

# extract the day, month, and year components
df_cleaned['Day'] = df_cleaned['Date'].dt.day
df_cleaned['Month'] = df_cleaned['Date'].dt.month
df_cleaned['Year'] = df_cleaned['Date'].dt.year

In [48]:
#Removing unwanted columns
df_cleaned.drop(['Date'], axis=1, inplace=True)
df_cleaned.drop(['WindGustDir'], axis=1, inplace=True)
df_cleaned.drop(['WindDir9am'], axis=1, inplace=True)
df_cleaned.drop(['WindDir3pm'], axis=1, inplace=True)

In [49]:
# Map binary variables
df_cleaned['RainTomorrow'] = df_cleaned['RainTomorrow'].map({'Yes': 1, 'No': 0})
df_cleaned['RainToday'] = df_cleaned['RainToday'].map({'Yes': 1, 'No': 0})

In [50]:
df_cleaned.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Day,Month,Year
0,Albury,13.4,22.9,0.6,4.8,8.5,44.0,20.0,24.0,71.0,...,1007.1,8.0,5.0,16.9,21.8,0,0,1,12,2008
1,Albury,7.4,25.1,0.0,4.8,8.5,44.0,4.0,22.0,44.0,...,1007.8,5.0,5.0,17.2,24.3,0,0,2,12,2008
2,Albury,12.9,25.7,0.0,4.8,8.5,46.0,19.0,26.0,38.0,...,1008.7,5.0,2.0,21.0,23.2,0,0,3,12,2008
3,Albury,9.2,28.0,0.0,4.8,8.5,24.0,11.0,9.0,45.0,...,1012.8,5.0,5.0,18.1,26.5,0,0,4,12,2008
4,Albury,17.5,32.3,1.0,4.8,8.5,41.0,7.0,20.0,82.0,...,1006.0,7.0,8.0,17.8,29.7,0,0,5,12,2008


##4. Normalization

##5. Verification