## Library Import
NumPy, pandas, matplotlib, seaborn

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Import Dataset

In [5]:
data = pd.read_csv("data/weatherAUS.csv")
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


Todo things:    
Missing data  
Outliers  
TimeSeries  

In [6]:
print("Data type : ", type(data))
print("Data dims : ", data.shape)
print(data.dtypes)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (145460, 23)
Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object


## Cleaning the dataset

* Removing duplicate
* Handling missing values
* Detect and handle outliers

In [7]:
# Removing duplicates
data.drop_duplicates()
# After removing
print("Data dims : ", data.shape)

Data dims :  (145460, 23)


No duplicates were found

The extend of missing data is severe, thus imputation is applied to preserve data. Since the data is TimeSeries, we are using interpolation for severely missing variables. Missing values from RainTomorrow is dropped. For variables with less missing values we use median and mode imputation.

In [8]:
# Define columns for imputation strategies
columns_large_missing = ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
columns_few_missing_num = ['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed', 
                           'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 
                           'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm']
columns_few_missing_cat = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

# Remove rows where 'RainTomorrow' is missing
data_cleaned = data.dropna(subset=['RainTomorrow'])

# Convert 'Date' to datetime and set as index for interpolation
data_cleaned['Date'] = pd.to_datetime(data_cleaned['Date'])
data_cleaned.set_index('Date', inplace=True)

# Apply interpolation for columns with large amounts of missing data
data_cleaned[columns_large_missing] = data_cleaned[columns_large_missing].interpolate(method='time')

# Reset index
data_cleaned.reset_index(inplace=True)

# Impute missing values for columns with fewer missing values
# Numerical columns: Using median
data_cleaned[columns_few_missing_num] = data_cleaned[columns_few_missing_num].apply(lambda x: x.fillna(x.median()))

# Categorical columns: Using mode
data_cleaned[columns_few_missing_cat] = data_cleaned[columns_few_missing_cat].apply(lambda x: x.fillna(x.mode()[0]))

# Final imputation for the few remaining missing values in 'Evaporation', 'Sunshine', and 'Cloud3pm'
data_cleaned['Evaporation'] = data_cleaned['Evaporation'].fillna(data_cleaned['Evaporation'].median())
data_cleaned['Sunshine'] = data_cleaned['Sunshine'].fillna(data_cleaned['Sunshine'].median())
data_cleaned['Cloud3pm'] = data_cleaned['Cloud3pm'].fillna(data_cleaned['Cloud3pm'].mode()[0])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['Date'] = pd.to_datetime(data_cleaned['Date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned[columns_large_missing] = data_cleaned[columns_large_missing].interpolate(method='time')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned[columns_few_missing_num] = data

In [5]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Assuming 'rain' is your DataFrame
# Identify numerical and categorical columns
numerical_cols = rain.select_dtypes(include=['float64']).columns.difference(['Cloud9am', 'Cloud3pm'])
categorical_cols = rain.select_dtypes(include=['object']).columns.tolist() + ['Cloud9am', 'Cloud3pm']

# Separate the DataFrame into numerical and categorical DataFrames
rain_numerical = rain[numerical_cols]
rain_categorical = rain[categorical_cols]

# Apply KNN Imputation to numerical data
# It's a good practice to scale numerical data before applying KNN imputation
scaler = MinMaxScaler()
rain_numerical_scaled = scaler.fit_transform(rain_numerical)
knn_imputer = KNNImputer(n_neighbors=5)
rain_numerical_imputed_scaled = knn_imputer.fit_transform(rain_numerical_scaled)
# Inverse transform to original scale after imputation
rain_numerical_imputed = scaler.inverse_transform(rain_numerical_imputed_scaled)
rain_numerical_imputed = pd.DataFrame(rain_numerical_imputed, columns=numerical_cols, index=rain_numerical.index)

# Apply Mode Imputation to categorical data
for col in categorical_cols:
    mode_value = rain_categorical[col].mode()[0]
    rain_categorical[col].fillna(mode_value, inplace=True)

# Merge the imputed numerical and categorical data back into a single DataFrame
rain_imputed = pd.concat([rain_numerical_imputed, rain_categorical], axis=1)

# Ensuring the original column order is preserved
rain_imputed = rain_imputed[rain.columns]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rain_categorical[col].fillna(mode_value, inplace=True)


## Exploratory Analysis

In [10]:
data_cleaned.head()
print("Data dims : ", data.shape)
print(data.dtypes)

Data dims :  (145460, 23)
Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object


In [19]:
# Export cleaned data 
rain_encoded.to_csv('data/weatherAUS_cleaned.csv', index=False)

## Outlier detection

## More EDA

## Feature selection

## Exporting data