In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1. Pearson Correlation based Feature Selection

In [2]:
def pearson_correlation_fs(_df, cls, threshold_corr=0.8):
    """
    Drop one feature out of every highly correlated feature pair.

    Every pair of feature columns (the 'cls' column excluded) whose absolute
    Pearson correlation is strictly greater than 'threshold_corr' is examined,
    and the member with the lower absolute correlation with 'cls' is marked
    for removal. On a tie (or a NaN correlation) the earlier column of the
    pair is the one marked.

    Note: a feature that is already marked for removal is still compared
    against the remaining features, so it can cause further removals.

    parameters-
        _df: dataframe holding the features and the 'cls' column (not modified)
        cls: name of the class/target column
        threshold_corr: absolute correlation above which a pair is considered
                        redundant
    returns a new dataframe without the dropped columns, and the list of
    dropped column names (in the column order of the correlation matrix)
    raises KeyError if 'cls' is not a column of the correlation matrix
    """

    corr_matrix = _df.corr()
    columns = list(corr_matrix.columns)

    if cls not in columns:
        raise KeyError(f"class column '{cls}' not found in the correlation matrix")

    cls_col_idx = columns.index(cls)
    abs_corr = corr_matrix.abs().to_numpy()

    # positions of the feature columns; the class column never takes part in a pair
    feature_idxs = [idx for idx, col in enumerate(columns) if col != cls]

    idxs_to_drop = set()  # a feature can lose in several pairs, keep it once

    # visit each unordered pair exactly once (col2 always precedes col1)
    for pos, col1_idx in enumerate(feature_idxs):
        for col2_idx in feature_idxs[:pos]:
            if abs_corr[col1_idx, col2_idx] > threshold_corr:
                # drop whichever of the two is less correlated with the class
                if abs_corr[col1_idx, cls_col_idx] < abs_corr[col2_idx, cls_col_idx]:
                    drop_idx = col1_idx
                else:
                    drop_idx = col2_idx

                print(f'dropping {columns[drop_idx]} from ({columns[col1_idx]}, {columns[col2_idx]})')

                idxs_to_drop.add(drop_idx)

    # sorted positions give a deterministic, column-ordered result
    cols_to_drop = [columns[idx] for idx in sorted(idxs_to_drop)]

    # DataFrame.drop returns a new frame; the caller's dataframe stays untouched
    return _df.drop(columns=cols_to_drop), cols_to_drop

In [3]:
# Load the training CSV straight from the project's GitHub repository
# (needs network access) and display 5 randomly sampled rows.
weather_avg_train_df = pd.read_csv('https://raw.githubusercontent.com/ferdouszislam/Weather-WaterLevel-Prediction-ML/main/Datasets/brri-datasets/final-dataset/train/brri-weather_avg_train_regression.csv')
weather_avg_train_df.sample(5)

Unnamed: 0,Month,Avg Min Temp. (degree Celcius),Avg Max Temp. (degree Celcius),Avg Rainfall (mm),Avg Actual Evaporation (mm),"Avg Relative Humidity (morning, %)","Avg Relative Humidity (afternoon, %)",Avg Sunshine (hour/day),Avg Cloudy (hour/day),Avg Solar Radiation (cal/cm^2/day),Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur,Rainfall (mm)
3087,0.181818,0.531008,0.682051,0.0,0.119498,0.80625,0.492958,0.628125,0.32,0.584367,0.0,0.0,1.0,0.0,0.0
2876,0.636364,0.806202,0.69359,0.204607,0.227504,0.7375,0.784038,0.38125,0.618667,0.456704,0.0,1.0,0.0,0.0,0.0
3426,0.818182,0.683463,0.608974,0.0,0.135823,0.65,0.596244,0.75,0.176,0.541454,0.0,0.0,0.0,1.0,0.0
1252,0.090909,0.404393,0.557692,0.008974,0.186757,0.5,0.262911,0.65,0.253333,0.492462,0.0,1.0,0.0,0.0,0.0
3552,0.636364,0.850129,0.752564,0.01376,0.149406,0.60625,0.619718,0.50625,0.512,0.552402,0.0,1.0,0.0,0.0,7.6


In [4]:
# Run the Pearson-correlation filter against the 'Rainfall (mm)' column with a
# 0.75 threshold; the returned dataframe is discarded, only the list of
# dropped column names is kept and displayed.
_, cols_to_drop = pearson_correlation_fs(weather_avg_train_df, cls='Rainfall (mm)', threshold_corr=0.75)
cols_to_drop

dropping Avg Max Temp. (degree Celcius) from (Avg Max Temp. (degree Celcius), Avg Min Temp. (degree Celcius))
dropping Avg Cloudy (hour/day) from (Avg Cloudy (hour/day), Avg Relative Humidity (afternoon, %))
dropping Avg Sunshine (hour/day) from (Avg Cloudy (hour/day), Avg Sunshine (hour/day))
dropping Avg Sunshine (hour/day) from (Avg Solar Radiation (cal/cm^2/day), Avg Sunshine (hour/day))


['Avg Sunshine (hour/day)',
 'Avg Cloudy (hour/day)',
 'Avg Max Temp. (degree Celcius)']

## 2. Feature Selection using SeleckKBest

Useful links: 
- https://scikit-learn.org/stable/modules/feature_selection.html
- https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
- https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172

In [5]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression


def seleckKBest_fs(_df, cls,
                   fixed_cols=['Station_Barisal', 'Station_Gazipur', 'Station_Rangpur', 'Station_Habiganj'], 
                   num_features=7, 
                   fs_method=mutual_info_regression):
    """
    Keep the 'num_features' best-scoring features according to SelectKBest.

    The columns in 'fixed_cols' and the 'cls' column are never scored and
    never dropped; every other column competes for one of the 'num_features'
    slots.

    parameters-
        _df: training dataframe (not modified)
        cls: name of the class/target column
        fixed_cols: columns that must always be kept (not modified)
        num_features: number of scored features to keep (SelectKBest 'k')
        fs_method: scoring function handed to SelectKBest
    returns a new dataframe without the dropped columns, and the list of
    dropped column names

    Note: with the default 'mutual_info_regression' the scores are not
    guaranteed to be identical between runs unless the scoring function is
    given a fixed random_state (see scikit-learn docs), so the selected
    columns may vary slightly.
    """
    # build a fresh list: appending to 'fixed_cols' itself would mutate the
    # shared default argument (or the caller's list) and leak across calls
    excluded_cols = [*fixed_cols, cls]

    X = _df.drop(columns=excluded_cols)
    y = _df[cls]

    # select the top 'num_features' candidate features using 'fs_method';
    # the result keeps these plus all of 'fixed_cols' plus 'cls'
    selector = SelectKBest(fs_method, k=num_features)
    selector.fit(X, y)
    selected_cols = set(X.columns[selector.get_support()])

    # only candidate columns can be dropped; fixed columns and 'cls' are not in X
    cols_to_drop = [col for col in X.columns if col not in selected_cols]

    # DataFrame.drop returns a new frame; the caller's dataframe stays untouched
    return _df.drop(columns=cols_to_drop), cols_to_drop

In [6]:
# Run SelectKBest-based selection against 'Rainfall (mm)' using the function's
# default fixed_cols, num_features and fs_method; the returned dataframe is
# discarded, only the list of dropped column names is kept and displayed.
_, cols_to_drop = seleckKBest_fs(weather_avg_train_df, 'Rainfall (mm)')
cols_to_drop

['Avg Relative Humidity (morning, %)',
 'Avg Sunshine (hour/day)',
 'Avg Solar Radiation (cal/cm^2/day)']