In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1. Pearson Correlation based Feature Selection

In [36]:
def pearson_correlation_fs(_df, cls, threshold_corr=0.8):
    """
    Pearson-correlation based feature selection.

    Checks the correlation of every pair of feature columns. Whenever the
    absolute correlation of a pair is greater than 'threshold_corr', the
    feature of the pair with the lower absolute correlation with 'cls' is
    marked for dropping (on a tie the column that appears earlier is dropped).

    Parameters
    ----------
    _df : pandas.DataFrame
        Input data. It is never modified.
    cls : column label
        Name of the target column; it is never compared against a feature
        nor dropped.
    threshold_corr : float, default 0.8
        Pairs with absolute correlation strictly above this are treated as
        redundant.

    Returns
    -------
    (pandas.DataFrame, list)
        A new frame without the flagged columns, and the list of flagged
        column names in the order they were first flagged.

    Raises
    ------
    ValueError
        If 'cls' is not a column of the correlation matrix of '_df'.
    """

    corr_matrix = _df.corr()
    columns = list(corr_matrix.columns)

    # fail early with a clear message instead of an unbound-variable error later
    if cls not in columns:
        raise ValueError(f"class column {cls!r} not found in the correlation matrix columns")
    cls_col_idx = columns.index(cls)

    # a list (not a set) so the result order is the same on every run
    cols_to_drop = []

    # find the features to drop; only the lower triangle is visited so each pair is seen once
    for col1_idx, col1 in enumerate(columns):
        if col1 == cls:
            continue

        for col2_idx in range(col1_idx):
            col2 = columns[col2_idx]
            if col2 == cls:
                continue

            if abs(corr_matrix.iloc[col1_idx, col2_idx]) > threshold_corr:
                # keep the feature that is more strongly correlated with the class
                if abs(corr_matrix.iloc[col1_idx, cls_col_idx]) < abs(corr_matrix.iloc[col2_idx, cls_col_idx]):
                    col_to_drop = col1
                else:
                    col_to_drop = col2

                print(f'dropping {col_to_drop} from ({col1}, {col2})')

                # keep only unique features
                if col_to_drop not in cols_to_drop:
                    cols_to_drop.append(col_to_drop)

    # DataFrame.drop returns a new frame; the caller's frame stays untouched
    return _df.drop(columns=cols_to_drop), cols_to_drop

In [7]:
# Load the training CSV (brri-weather_avg_train_regression.csv) straight from the GitHub repository.
weather_avg_train_df = pd.read_csv(
    'https://raw.githubusercontent.com/ferdouszislam/Weather-WaterLevel-Prediction-ML/main/Datasets/brri-datasets/final-dataset/train/brri-weather_avg_train_regression.csv'
)

# Show five randomly sampled rows as a quick look at the loaded frame.
weather_avg_train_df.sample(5)

Unnamed: 0,Month,Avg Min Temp. (degree Celcius),Avg Max Temp. (degree Celcius),Avg Rainfall (mm),Avg Actual Evaporation (mm),"Avg Relative Humidity (morning, %)","Avg Relative Humidity (afternoon, %)",Avg Sunshine (hour/day),Avg Cloudy (hour/day),Avg Solar Radiation (cal/cm^2/day),Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur,Rainfall (mm)
3037,0.0,0.178295,0.330769,0.0,0.050934,0.88125,0.910798,0.540625,0.306667,0.321075,0.0,0.0,0.0,1.0,0.0
2528,0.545455,0.808786,0.696154,0.064613,0.183362,0.6125,0.633803,0.728125,0.370667,0.720909,0.0,1.0,0.0,0.0,6.6
2927,0.363636,0.640827,0.715385,0.116063,0.189009,0.9125,0.58216,0.55,0.498667,0.590527,1.0,0.0,0.0,0.0,0.6
475,0.272727,0.563307,0.65641,0.004487,0.220713,0.53125,0.413146,0.634375,0.370667,0.627637,0.0,0.0,0.0,1.0,0.0
2560,0.545455,0.813953,0.746154,0.004786,0.213922,0.84375,0.568075,0.796875,0.304,0.773694,1.0,0.0,0.0,0.0,14.0


In [37]:
# Run the Pearson-correlation filter against the 'Rainfall (mm)' target.
# Only the list of flagged columns is kept; the returned frame is discarded.
_, cols_to_drop = pearson_correlation_fs(
    weather_avg_train_df,
    cls='Rainfall (mm)',
    threshold_corr=0.75,
)
cols_to_drop

dropping Avg Max Temp. (degree Celcius) from (Avg Max Temp. (degree Celcius), Avg Min Temp. (degree Celcius))
dropping Avg Cloudy (hour/day) from (Avg Cloudy (hour/day), Avg Relative Humidity (afternoon, %))
dropping Avg Sunshine (hour/day) from (Avg Cloudy (hour/day), Avg Sunshine (hour/day))
dropping Avg Sunshine (hour/day) from (Avg Solar Radiation (cal/cm^2/day), Avg Sunshine (hour/day))


['Avg Max Temp. (degree Celcius)',
 'Avg Sunshine (hour/day)',
 'Avg Cloudy (hour/day)']