# Import libraries

In [1]:
# Scientific libraries

import numpy as np
import math

# Visualization Libraries

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Data analysis and ML libraries

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

#feature selector
from sklearn.feature_selection import SelectKBest, chi2, f_classif

#pipeline
from sklearn.pipeline import Pipeline, make_pipeline

# Warnings
# NOTE(review): the "ignore" filter below is installed without a category or
# module restriction, so every warning raised after this cell is silenced for
# the rest of the session. Consider narrowing it to the specific categories
# that are known to be noise.
import warnings
warnings.filterwarnings("ignore")

# Import training and test sets

In [2]:
# Load the pre-split feature matrices and target frames, one partition per statement.
# Files are read in the order: training features, training target, test features, test target.
X_train, y_train = (
    pd.read_csv('Data/training_feat.csv'),
    pd.read_csv('Data/training_target.csv'),
)
X_test, y_test = (
    pd.read_csv('Data/test_feat.csv'),
    pd.read_csv('Data/test_target.csv'),
)

In [3]:
# Sanity check: (rows, columns) of the training feature matrix before selection.
X_train.shape

(183579, 21)

# Feature selection: model independent techniques

## SelectKBest

Rank the features with two different score functions and compare the results:

- chi2
- f_classif

In [4]:
def selector_Kbest(X, y, score_func=chi2, k=8):
    """Select the ``k`` highest-scoring features of ``X`` with ``SelectKBest``.

    The selector is fitted on ``(X, y)``; the score of every feature is
    printed as a table sorted from highest to lowest, and the names of the
    ``k`` retained columns are returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Column names are taken from ``X.columns``, so a
        DataFrame (or any object exposing ``.columns``) is required.
    y : array-like
        Target values passed straight through to ``SelectKBest.fit``.
    score_func : callable, default ``chi2``
        Univariate scoring function forwarded to ``SelectKBest``.
    k : int, default 8
        Number of features to keep, forwarded to ``SelectKBest``.

    Returns
    -------
    list
        Names of the selected columns, in the column order of ``X``.
    """
    # Fit the univariate selector on the data that was actually passed in.
    selector = SelectKBest(score_func=score_func, k=k)
    selector.fit(X, y)

    # Positional indices of the retained features, mapped back to the column
    # labels of X itself (not of a notebook-level global), so the function
    # works for any frame and on a fresh kernel.
    selected_idx = selector.get_support(indices=True)
    important_features = X.columns[selected_idx].tolist()

    # One score per input column, labelled with X's own column names.
    scores = pd.DataFrame({'score': selector.scores_}, index=X.columns)
    print(scores.sort_values(by='score', ascending=False))

    return important_features

In [5]:
# Score every feature with the chi-squared statistic and keep the 9 best.
# NOTE(review): scikit-learn's chi2 expects non-negative feature values — confirm the inputs satisfy this.

best_features_chi2 = selector_Kbest(X_train, y_train, score_func=chi2, k=9)


                           score
Physical_health     77409.074223
BMI                 12287.060612
Mental_health        8469.316193
Age                  6860.252989
High_BP              6482.689469
Difficulty_walking   6236.702230
General_health       6060.169451
Heart_condition      4678.899087
High_Chol            3855.553525
Income               2639.123665
Stroke               1685.868465
Heavy_drinking        752.372835
Physical_activity     493.727237
Education             373.592865
Smoker                206.310654
Sex                   108.644496
Veggies                62.163159
No_Doctor_bc_Cost      43.954521
Fruits                 39.372131
Chol_check             38.737227
Health_plan             7.216212


In [7]:
# Same selection with the ANOVA F-statistic (f_classif) as the score function; again keep 9 features.
best_features_f_classif = selector_Kbest(X_train, y_train, score_func=f_classif, k=9)

                           score
General_health      15028.459921
High_BP             12696.639767
BMI                  8028.929585
Difficulty_walking   7990.290227
High_Chol            7172.991333
Age                  5995.341829
Heart_condition      5368.293621
Physical_health      4532.827198
Income               3621.117911
Education            1906.303916
Physical_activity    1875.211102
Stroke               1781.432236
Chol_check            963.943424
Heavy_drinking        804.709027
Mental_health         500.257213
Smoker                386.635201
Veggies               302.338736
Sex                   193.495629
Health_plan           133.841125
Fruits                101.570460
No_Doctor_bc_Cost      48.478989


In [8]:
# Column names retained by the chi2-based selection.
best_features_chi2

['High_BP',
 'High_Chol',
 'BMI',
 'Heart_condition',
 'General_health',
 'Mental_health',
 'Physical_health',
 'Difficulty_walking',
 'Age']

In [9]:
# Column names retained by the f_classif-based selection.
best_features_f_classif

['High_BP',
 'High_Chol',
 'BMI',
 'Heart_condition',
 'General_health',
 'Physical_health',
 'Difficulty_walking',
 'Age',
 'Income']

In [10]:
#### For now, let's continue with the feature set selected by f_classif

In [11]:
# Build the reduced training and test sets from the f_classif selection

In [12]:
# Keep only the f_classif-selected columns in both partitions.
# Both right-hand sides are evaluated before either name is rebound.
X_train, X_test = X_train[best_features_f_classif], X_test[best_features_f_classif]

In [13]:
# Persist the reduced feature sets as CSV files; index=False leaves the row
# index out of the files.
X_train.to_csv('Data/best_training_feat.csv', index=False)
X_test.to_csv('Data/best_test_feat.csv', index=False)