<a href="https://colab.research.google.com/github/ferdouszislam/Weather-WaterLevel-Prediction-ML/blob/main/Notebooks/feature_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the regression training split straight from the project's GitHub repository,
# then preview five randomly sampled rows (no seed is set, so the preview differs between runs).
weather_avg_train_df = pd.read_csv('https://raw.githubusercontent.com/ferdouszislam/Weather-WaterLevel-Prediction-ML/main/Datasets/brri-datasets/final-dataset/train/brri-weather_avg_train_regression.csv')
weather_avg_train_df.sample(5)

Unnamed: 0,Month,Avg Min Temp. (degree Celcius),Avg Max Temp. (degree Celcius),Avg Rainfall (mm),Avg Actual Evaporation (mm),"Avg Relative Humidity (morning, %)","Avg Relative Humidity (afternoon, %)",Avg Sunshine (hour/day),Avg Cloudy (hour/day),Avg Solar Radiation (cal/cm^2/day),Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur,Rainfall (mm)
2705,0.727273,0.741602,0.658974,0.011367,0.11545,0.95,0.732394,0.528125,0.437333,0.49742,1.0,0.0,0.0,0.0,0.0
3605,0.545455,0.816537,0.653846,0.0,0.169779,0.69,0.605634,0.415625,0.637333,0.488128,0.0,0.0,0.0,1.0,0.0
2098,0.818182,0.696382,0.652564,0.0,0.169779,0.75625,0.521127,0.590625,0.328,0.47181,0.0,0.0,0.0,1.0,0.0
117,0.818182,0.801034,0.769231,0.014957,0.135823,0.84375,0.661972,0.653125,0.282667,0.528753,1.0,0.0,0.0,0.0,0.0
2492,0.181818,0.452196,0.548718,0.005085,0.169779,0.66875,0.507042,0.509375,0.421333,0.494545,0.0,0.0,0.0,1.0,0.0


## 1. Pearson Correlation based Feature Selection

In [3]:
def pearson_correlation_fs(_df, cls, threshold_corr=0.8):
    """
    Drop one feature out of every highly correlated pair of features.

    Every pair of feature columns whose absolute Pearson correlation is
    strictly greater than 'threshold_corr' is inspected; from such a pair the
    feature with the lower absolute correlation with 'cls' is dropped (on a
    tie the earlier column of the pair is dropped). A feature that has already
    been dropped is not compared again, and 'cls' itself is never dropped.
    One 'dropping ...' line is printed per discarded feature.

    parameters-
        _df: input dataframe (left unmodified, a copy is reduced instead)
        cls: name of the target column
        threshold_corr: absolute correlation above which a pair counts as redundant

    returns the reduced dataframe and the list of dropped columns
    (in the order in which they were dropped)

    raises ValueError if 'cls' is not a column of the correlation matrix
    """

    df = _df.copy()

    corr_matrix = df.corr()
    columns = list(corr_matrix.columns)

    # fail with a clear message instead of an unbound 'cls_col_idx' later on
    if cls not in columns:
        raise ValueError(f"class column '{cls}' not found among the correlated columns")
    cls_col_idx = columns.index(cls)

    # a list keeps the drop order deterministic; a column is never added twice
    # because pairs containing an already dropped column are skipped
    cols_to_drop = []

    # walk the lower triangle of the correlation matrix
    for col1_idx, col1 in enumerate(columns):
        if col1 == cls:
            continue

        for col2_idx in range(col1_idx):
            col2 = columns[col2_idx]

            # col1 may get dropped inside this inner loop, so re-check it every time
            if col2 == cls or col1 in cols_to_drop or col2 in cols_to_drop:
                continue

            if abs(corr_matrix.iloc[col1_idx, col2_idx]) <= threshold_corr:
                continue

            col1_cls_corr = abs(corr_matrix.iloc[col1_idx, cls_col_idx])
            col2_cls_corr = abs(corr_matrix.iloc[col2_idx, cls_col_idx])
            col_to_drop = col1 if col1_cls_corr < col2_cls_corr else col2

            print(f'dropping {col_to_drop} from ({col1}, {col2})')

            cols_to_drop.append(col_to_drop)

    # return the reduced copy (the original returned the untouched input)
    df = df.drop(columns=cols_to_drop)

    return df, cols_to_drop

In [4]:
# Run the Pearson-based selection with a 0.75 correlation threshold.
# Only the list of dropped columns is used here; the returned dataframe is discarded.
_, cols_to_drop = pearson_correlation_fs(weather_avg_train_df, cls='Rainfall (mm)', threshold_corr=0.75)
cols_to_drop

dropping Avg Max Temp. (degree Celcius) from (Avg Max Temp. (degree Celcius), Avg Min Temp. (degree Celcius))
dropping Avg Cloudy (hour/day) from (Avg Cloudy (hour/day), Avg Relative Humidity (afternoon, %))
dropping Avg Sunshine (hour/day) from (Avg Solar Radiation (cal/cm^2/day), Avg Sunshine (hour/day))


['Avg Sunshine (hour/day)',
 'Avg Cloudy (hour/day)',
 'Avg Max Temp. (degree Celcius)']

## 2. Feature Selection using SelectKBest

Useful links: 
- https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
- https://scikit-learn.org/stable/modules/feature_selection.html
- https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172

In [5]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, mutual_info_classif


def seleckKBest_fs(_df, cls, is_regression,
                   fixed_cols=['Station_Barisal', 'Station_Gazipur', 'Station_Rangpur', 'Station_Habiganj'], 
                   num_features=7, 
                   fs_method=mutual_info_regression):
    """
    Keep the 'num_features' best scoring features according to SelectKBest.

    parameters-
        _df: training dataframe (left unmodified, a copy is reduced instead)
        cls: class/target column name
        is_regression: True scores with mutual_info_regression,
                       False scores with mutual_info_classif
        fixed_cols: columns that are excluded from scoring and always kept
        num_features: number of scored features to keep
        fs_method: kept for backward compatibility; the scoring function is
                   always chosen from 'is_regression', so this value is overridden

    returns dataframe and list of dropped columns
    """
    df = _df.copy()

    # build a fresh list: appending to 'fixed_cols' itself would mutate the shared
    # default (and any list passed by the caller), leaking 'cls' into later calls
    excluded_cols = list(fixed_cols) + [cls]
    X = df.drop(columns=excluded_cols)
    y = df[cls]

    if is_regression:
        fs_method = mutual_info_regression
    else:
        fs_method = mutual_info_classif

    # select top 'num_features' features based on mutual information
    # total features would be 'num_features' + the fixed columns
    selector = SelectKBest(fs_method, k=num_features)
    selector.fit(X, y)
    selected_cols = set(X.columns[selector.get_support()])

    # only scored columns can be dropped; 'cls' and 'fixed_cols' are never in X
    cols_to_drop = [col for col in X.columns if col not in selected_cols]

    df = df.drop(columns=cols_to_drop)

    return df, cols_to_drop

In [6]:
# SelectKBest with the function defaults: the 7 best scoring features are kept,
# and is_regression=True selects mutual_info_regression as the scoring function.
_, cols_to_drop = seleckKBest_fs(_df=weather_avg_train_df, cls='Rainfall (mm)', is_regression=True)
cols_to_drop

['Avg Sunshine (hour/day)',
 'Avg Cloudy (hour/day)',
 'Avg Solar Radiation (cal/cm^2/day)']

## 3. Sequential Feature Selection (SFS)
Useful links:  
- https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html#sklearn.feature_selection.SequentialFeatureSelector
- https://scikit-learn.org/stable/modules/feature_selection.html
- https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172

In [7]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.svm import LinearSVC, LinearSVR

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression


def selectSequential_fs(_df, cls, is_regression,
                        fixed_cols=['Station_Barisal', 'Station_Gazipur', 'Station_Rangpur', 'Station_Habiganj'], 
                        num_features=7, 
                        fs_method='forward'):
    """
    Keep 'num_features' features chosen by SequentialFeatureSelector.

    parameters-
        _df: training dataframe (left unmodified, a copy is reduced instead)
        cls: class/target column name
        is_regression: True uses a LinearSVR estimator scored with 'r2',
                       False uses an L1-penalised LinearSVC scored with 'accuracy'
        fixed_cols: columns that are excluded from the search and always kept
        num_features: number of searched features to keep
        fs_method: search direction passed to SequentialFeatureSelector

    returns dataframe and list of dropped columns
    """
    df = _df.copy()

    # build a fresh list: appending to 'fixed_cols' itself would mutate the shared
    # default (and any list passed by the caller), leaking 'cls' into later calls
    excluded_cols = list(fixed_cols) + [cls]
    X = df.drop(columns=excluded_cols)
    y = df[cls]

    if is_regression:
        estimator = LinearSVR(C=0.01, random_state=42)
        scoring = 'r2'
    else:
        estimator = LinearSVC(C=0.01, penalty="l1", dual=False, random_state=42)
        scoring = 'accuracy'

    # select 'num_features' features by 10-fold cross-validated sequential search
    # total features would be 'num_features' + the fixed columns
    selector = SequentialFeatureSelector(estimator=estimator, n_features_to_select=num_features, cv=10, direction=fs_method, scoring=scoring)
    selector.fit(X, y)
    selected_cols = set(X.columns[selector.get_support()])

    # only searched columns can be dropped; 'cls' and 'fixed_cols' are never in X
    cols_to_drop = [col for col in X.columns if col not in selected_cols]

    df = df.drop(columns=cols_to_drop)

    return df, cols_to_drop

In [8]:
# Sequential selection with the function defaults: forward direction, 7 features kept,
# and is_regression=True selects the LinearSVR estimator scored with 'r2'.
_, cols_to_drop = selectSequential_fs(_df=weather_avg_train_df, cls='Rainfall (mm)', is_regression=True)
cols_to_drop

['Avg Min Temp. (degree Celcius)',
 'Avg Max Temp. (degree Celcius)',
 'Avg Relative Humidity (afternoon, %)']