## Library Import
NumPy, pandas, matplotlib, seaborn

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Import Dataset

In [2]:
df = pd.read_csv("data/weatherAUS.csv")
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


Todo things:    
Missing data  
Outliers  
TimeSeries  

In [3]:
print("Data type : ", type(rain))
print("Data dims : ", rain.shape)
print(rain.dtypes)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (145460, 23)
Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object


## Cleaning the dataset

* Removing duplicate
* Handling missing values
* Correct data types, for `object` data types, use one hot encoding, for Date, use convert to Date types

In [4]:
# Removing duplicates
rain.drop_duplicates()
# After removing
print("Data dims : ", rain.shape)

Data dims :  (145460, 23)


No duplicates were found

The extend of missing data is severe, thus imputation is applied to preserve data. Since the data is TimeSeries, we are using interpolation for severely missing variables. Missing values from RainTomorrow is dropped. For variables with less missing values we use median and mode imputation.

In [5]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Assuming 'rain' is your DataFrame
# Identify numerical and categorical columns
numerical_cols = rain.select_dtypes(include=['float64']).columns.difference(['Cloud9am', 'Cloud3pm'])
categorical_cols = rain.select_dtypes(include=['object']).columns.tolist() + ['Cloud9am', 'Cloud3pm']

# Separate the DataFrame into numerical and categorical DataFrames
rain_numerical = rain[numerical_cols]
rain_categorical = rain[categorical_cols]

# Apply KNN Imputation to numerical data
# It's a good practice to scale numerical data before applying KNN imputation
scaler = MinMaxScaler()
rain_numerical_scaled = scaler.fit_transform(rain_numerical)
knn_imputer = KNNImputer(n_neighbors=5)
rain_numerical_imputed_scaled = knn_imputer.fit_transform(rain_numerical_scaled)
# Inverse transform to original scale after imputation
rain_numerical_imputed = scaler.inverse_transform(rain_numerical_imputed_scaled)
rain_numerical_imputed = pd.DataFrame(rain_numerical_imputed, columns=numerical_cols, index=rain_numerical.index)

# Apply Mode Imputation to categorical data
for col in categorical_cols:
    mode_value = rain_categorical[col].mode()[0]
    rain_categorical[col].fillna(mode_value, inplace=True)

# Merge the imputed numerical and categorical data back into a single DataFrame
rain_imputed = pd.concat([rain_numerical_imputed, rain_categorical], axis=1)

# Ensuring the original column order is preserved
rain_imputed = rain_imputed[rain.columns]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rain_categorical[col].fillna(mode_value, inplace=True)


Exploratory Analysis

In [8]:
rain_imputed.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,3.84,7.76,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,7.0,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,5.04,9.82,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,7.0,7.0,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,4.52,9.5,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,7.0,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,3.52,8.54,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,7.0,7.0,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,4.44,10.46,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [14]:
from sklearn.preprocessing import LabelEncoder

def encode_categorical_columns_label_encoding(df, columns_to_encode):
    """
    Applies Label Encoding to specified categorical columns in the DataFrame, 
    retains all other columns unchanged, and prints the mapping from original 
    categorical values to encoded numeric codes.
    
    Parameters:
    - df: pandas DataFrame containing the columns to encode.
    - columns_to_encode: list of strings, names of the categorical columns to be encoded.
    
    Returns:
    - A new DataFrame with specified columns label encoded and all other columns unchanged.
    """
    # Copy the DataFrame to avoid modifying the original data
    encoded_df = df.copy()
    
    # Initialize LabelEncoder
    encoder = LabelEncoder()
    
    # Loop through each column to encode
    for column in columns_to_encode:
        # Apply LabelEncoder and replace the column in the DataFrame
        encoded_df[column] = encoder.fit_transform(df[column])
        
        # Print the mappings for each encoded column
        mappings = {index: label for index, label in enumerate(encoder.classes_)}
        print(f"Encoding mapping for '{column}': {mappings}")
    
    return encoded_df

# Assuming 'rain_imputed' is your DataFrame
columns_to_encode = ['RainToday', 'RainTomorrow', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'Location']

# Apply the encoding function and get the new DataFrame with specified categorical columns encoded
rain_encoded = encode_categorical_columns_label_encoding(rain_imputed, columns_to_encode)

Encoding mapping for 'RainToday': {0: 'No', 1: 'Yes'}
Encoding mapping for 'RainTomorrow': {0: 'No', 1: 'Yes'}
Encoding mapping for 'WindGustDir': {0: 'E', 1: 'ENE', 2: 'ESE', 3: 'N', 4: 'NE', 5: 'NNE', 6: 'NNW', 7: 'NW', 8: 'S', 9: 'SE', 10: 'SSE', 11: 'SSW', 12: 'SW', 13: 'W', 14: 'WNW', 15: 'WSW'}
Encoding mapping for 'WindDir9am': {0: 'E', 1: 'ENE', 2: 'ESE', 3: 'N', 4: 'NE', 5: 'NNE', 6: 'NNW', 7: 'NW', 8: 'S', 9: 'SE', 10: 'SSE', 11: 'SSW', 12: 'SW', 13: 'W', 14: 'WNW', 15: 'WSW'}
Encoding mapping for 'WindDir3pm': {0: 'E', 1: 'ENE', 2: 'ESE', 3: 'N', 4: 'NE', 5: 'NNE', 6: 'NNW', 7: 'NW', 8: 'S', 9: 'SE', 10: 'SSE', 11: 'SSW', 12: 'SW', 13: 'W', 14: 'WNW', 15: 'WSW'}
Encoding mapping for 'Location': {0: 'Adelaide', 1: 'Albany', 2: 'Albury', 3: 'AliceSprings', 4: 'BadgerysCreek', 5: 'Ballarat', 6: 'Bendigo', 7: 'Brisbane', 8: 'Cairns', 9: 'Canberra', 10: 'Cobar', 11: 'CoffsHarbour', 12: 'Dartmoor', 13: 'Darwin', 14: 'GoldCoast', 15: 'Hobart', 16: 'Katherine', 17: 'Launceston', 18:

In [15]:
rain_encoded.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,2,13.4,22.9,0.6,3.84,7.76,13,44.0,13,...,71.0,22.0,1007.7,1007.1,8.0,7.0,16.9,21.8,0,0
1,2008-12-02,2,7.4,25.1,0.0,5.04,9.82,14,44.0,6,...,44.0,25.0,1010.6,1007.8,7.0,7.0,17.2,24.3,0,0
2,2008-12-03,2,12.9,25.7,0.0,4.52,9.5,15,46.0,13,...,38.0,30.0,1007.6,1008.7,7.0,2.0,21.0,23.2,0,0
3,2008-12-04,2,9.2,28.0,0.0,3.52,8.54,4,24.0,9,...,45.0,16.0,1017.6,1012.8,7.0,7.0,18.1,26.5,0,0
4,2008-12-05,2,17.5,32.3,1.0,4.44,10.46,13,41.0,1,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0,0


In [19]:
# Export cleaned data 
rain_encoded.to_csv('data/weatherAUS_cleaned.csv', index=False)

Outlier detection

More EDA

Feature selection

Exporting data