In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
# Load the tab-separated wildfire data from a path relative to the notebook.
df = pd.read_csv(
    "../Data/wildfires.txt",
    sep="\t",
)


In [11]:
from math import log2
import itertools 

def _label_entropy(labels):
    """Return the base-2 Shannon entropy of a Series of class labels (0.0 if empty or pure)."""
    if len(labels) == 0:
        return 0.0
    # value_counts drops missing labels by default, so proportions are taken
    # over the non-missing labels only.
    probs = labels.value_counts(normalize=True)
    # Categorical targets report unused categories with a zero share; log2(0)
    # is undefined, so leave them out.
    probs = probs[probs > 0]
    # max() also normalises the -0.0 produced by a pure bucket to 0.0.
    return max(0.0, float(-(probs * np.log2(probs)).sum()))


def information_gain(df, target, columns):
    """
    Calculate the information gain for all the columns to be presented at the
    feature selection screen. The mean value is used to bucket the values.

    For every column in ``columns`` the rows are split into a "high" bucket
    (value >= column mean) and a "low" bucket (value < column mean). The gain
    is the entropy of ``target`` over all rows minus the entropies of the two
    buckets, each weighted by its share of the rows in ``df``.

    Any number of target classes is supported; a bucket containing a single
    class (or no rows) contributes zero entropy.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the target column and the candidate feature columns.
    target : str
        Name of the column holding the class labels.
    columns : iterable of str
        Names of the feature columns to score. The mean of each column is
        taken with ``np.mean``, so non-numeric columns are not handled here
        and the error from the mean computation propagates to the caller.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``columns`` (same order, index 0..n-1) with the
        columns "Column", "Mean Value" and "Information Gain"; the two numeric
        columns are rounded to 2 decimals. The frame is empty (but still has
        these three columns) when ``df`` has no rows or ``columns`` is empty.
    """
    result_columns = ["Column", "Mean Value", "Information Gain"]

    n_rows = len(df)
    if n_rows == 0:
        # Nothing to split; avoids dividing by zero in the bucket weights.
        return pd.DataFrame(columns=result_columns)

    base_entropy = _label_entropy(df[target])

    # Collect rows in a list and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row-by-row is quadratic.
    records = []
    for col in columns:
        mean_val = np.mean(df[col])
        # Rows whose feature value is NaN compare False on both sides and so
        # land in neither bucket; weights stay relative to all rows of df.
        high_labels = df.loc[df[col] >= mean_val, target]
        low_labels = df.loc[df[col] < mean_val, target]

        # Some columns (e.g. rainfall) have only one class above the mean.
        # Such a pure bucket has zero entropy, which makes the feature score
        # highly rather than breaking the calculation.
        info_gain = (
            base_entropy
            - (len(high_labels) / n_rows) * _label_entropy(high_labels)
            - (len(low_labels) / n_rows) * _label_entropy(low_labels)
        )
        records.append([col, np.round(mean_val, 2), np.round(info_gain, 2)])

    return pd.DataFrame(records, columns=result_columns)
        
        

def feature_selection(df):
    """
    Interactively choose the target column and the feature columns to analyse.

    Prints the dtypes of ``df``, prompts for the target column, prints a table
    of per-feature information gain (computed by ``information_gain`` on bins
    either side of each feature's mean) sorted from highest to lowest, and
    finally prompts for a comma-separated list of feature names.

    Side effect: string values in the target column of ``df`` are
    whitespace-stripped in place, so the caller's frame is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and the candidate feature columns.

    Returns
    -------
    tuple of (str, list of str)
        The chosen target column name and the chosen feature column names.

    Raises
    ------
    KeyError
        If the entered target, or any entered feature name, is not a column
        of ``df`` (the target itself is not accepted as a feature).
    """
    print(df.dtypes)

    raw_target = input('Pick the target variable')
    # Use the name exactly as typed when it matches; otherwise tolerate stray
    # whitespace, consistent with how the feature list below is cleaned.
    target = raw_target if raw_target in df.columns else raw_target.strip()
    if target not in df.columns:
        raise KeyError(
            "Target column {!r} not found; available columns: {}".format(
                target, list(df.columns)
            )
        )

    # Normalise padded string labels in place; non-string labels (numbers,
    # NaN) are left untouched instead of raising AttributeError on .strip().
    df[target] = [x.strip() if isinstance(x, str) else x for x in df[target]]
    df_cols = df.drop(columns=target)

    ig = information_gain(df, target, df_cols.columns)
    # Build the name/dtype table directly so it does not depend on the name
    # that reset_index() would give the index column.
    info_cols = pd.DataFrame(
        {'Column': list(df_cols.columns), 'Data Type': list(df_cols.dtypes)}
    )
    info_cols = info_cols.merge(ig, on='Column').sort_values(
        "Information Gain", ascending=False
    )
    print("Information gain calculated for bins either side of mean values for each feature")
    print(info_cols)

    cols = input("Please enter the desired columns for anaylsis: ")
    # Drop empty entries produced by trailing or doubled commas.
    cols = [x.strip() for x in cols.split(',') if x.strip()]
    unknown = [x for x in cols if x not in df_cols.columns]
    if unknown:
        # Fail here with a clear message rather than later at df[cols].
        raise KeyError(
            "Unknown feature column(s) {}; available columns: {}".format(
                unknown, list(df_cols.columns)
            )
        )
    return target, cols

In [12]:
# Interactive step: feature_selection prompts (via input()) for the target
# column and for a comma-separated list of feature columns.
target, cols = feature_selection(df)

yes               object
year               int64
temp               int64
humidity           int64
rainfall         float64
drought_code     float64
buildup_index    float64
day                int64
month              int64
wind_speed         int64
dtype: object
Information gain calculated for bins either side of mean values for each feature
          Column Data Type  Mean Value  Information Gain
5  buildup_index   float64       16.54              0.30
3       rainfall   float64        0.82              0.25
4   drought_code   float64       48.54              0.23
1           temp     int64       31.91              0.18
2       humidity     int64       62.28              0.11
6            day     int64       15.69              0.03
7          month     int64        7.55              0.03
0           year     int64     2011.98              0.00
8     wind_speed     int64       16.45              0.00


In [15]:
# Show the target column name chosen in the interactive step.
target

'yes'

In [16]:
# Show the feature columns chosen in the interactive step.
cols

['buildup_index', 'rainfall', 'drought_code', 'temp']

In [19]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df[cols],
    df[target],
    test_size=0.3,
    random_state=10,
)

In [23]:
# Inspect the held-out labels (pandas truncates the display of a long Series).
y_test

59     yes
35     yes
47     yes
98      no
101     no
55     yes
75      no
58     yes
174     no
5      yes
178     no
130    yes
159    yes
39      no
78      no
19      no
1       no
2       no
117     no
10     yes
95     yes
6      yes
162    yes
121     no
20     yes
132    yes
99      no
168    yes
72      no
61     yes
      ... 
68      no
46     yes
166    yes
179     no
138     no
70      no
175     no
152    yes
182    yes
155    yes
52     yes
129     no
83      no
110    yes
143     no
49      no
63     yes
194    yes
7      yes
26     yes
105    yes
60     yes
111     no
127    yes
203     no
165    yes
128     no
24     yes
43      no
76      no
Name: yes, Length: 62, dtype: object