# Feature Selection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv(R"D:\OneDrive\Documents\DATA SCIENCE\KAGGLE\ClassificationWithAnAcademicSuccessDataset\data\playground-series-s4e6\data1.csv")

In [26]:
# pd.set_option("display.max.columns", None)
# pd.set_option("display.max_rows", None)

In [4]:
# pd.set_option("display.max.columns", None)

In [5]:
df = df.drop(['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,Daytime/evening attendance,Previous qualification (grade),Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,...,Father higher education,Father basic education,Father technical education,Father no education,Father unknown education,Previous qualification: Secondary,Previous qualification: Higher,Previous qualification: Basic,Previous qualification: Technical course,Target
0,1,126.0,122.6,0,0,0,1,0,1,18,...,0,1,0,0,0,1,0,0,0,Graduate
1,1,125.0,119.8,1,0,0,1,0,0,18,...,0,1,0,0,0,1,0,0,0,Dropout
2,1,137.0,144.7,0,0,0,1,1,0,18,...,0,1,0,0,0,1,0,0,0,Dropout
3,1,131.0,126.1,1,0,0,1,0,1,18,...,1,0,0,0,0,1,0,0,0,Enrolled
4,1,132.0,120.1,1,0,0,1,0,0,18,...,0,1,0,0,0,1,0,0,0,Graduate


## Univariate Feature Selection

In this part we will try to select the best features based on univariate statistical tests, looking for statistically significant relationship between each feature and the target.
<br>
We will use Scikit-learn to expose feature selection routines as objects that implement the transform method and get Fscore and P-value for each parameter. Features with a too high P-value will be discared (unlikely to be related to the target).

In [6]:
# Choosing target variable and feature matrix

X = df.drop("Target",axis = 1)   # Feature Matrix
y = df["Target"]          # Target Variable


In [7]:
from sklearn.feature_selection import SelectKBest, f_classif


def select_kbest_classif(data_frame, target, k=5): 
    """
    Selecting K-Best features for regression
    :param k: number of features selected
    :returns feature_scores: scores for each feature in the data as 
    pandas DataFrame
    """
    feat_selector = SelectKBest(f_classif, k=k)
    _ = feat_selector.fit(data_frame.drop(target, axis=1), data_frame[target])
    
    feat_scores = pd.DataFrame()
    feat_scores["F Score"] = feat_selector.scores_
    feat_scores["P Value"] = feat_selector.pvalues_
    feat_scores["Support"] = feat_selector.get_support()
    feat_scores["Attribute"] = data_frame.drop(target, axis=1).columns
    
    return feat_scores

In [8]:
Univariate_tab = select_kbest_classif(df, "Target", k=13)
Univariate_tab = Univariate_tab.sort_values(["F Score", "P Value"], ascending=[False, False])

Univariate_tab

Unnamed: 0,F Score,P Value,Support,Attribute
20,60692.223212,0.000000,True,Curricular units 2nd sem (approved)
21,49461.256514,0.000000,True,Curricular units 2nd sem (grade)
14,43427.747186,0.000000,True,Curricular units 1st sem (approved)
15,34578.966990,0.000000,True,Curricular units 1st sem (grade)
6,9562.427508,0.000000,True,Tuition fees up to date
...,...,...,...,...
10,0.320546,0.725754,False,International
81,0.285531,0.751616,False,Natio_Portuguese
46,0.283470,0.753167,False,Application mode_27
85,0.283470,0.753167,False,Natio_Lithuanian


We found our 13 best parameters (Support column shows "True"). 
Let's create a Features_Select summary tab:

In [9]:
Features_Select = pd.DataFrame({'Feature': X.columns})
Features_Select['Univariate'] = np.where(Univariate_tab['Support'] == True, 1, 0)

Features_Select

Unnamed: 0,Feature,Univariate
0,Daytime/evening attendance,1
1,Previous qualification (grade),1
2,Admission grade,1
3,Displaced,1
4,Educational special needs,1
...,...,...
134,Father unknown education,0
135,Previous qualification: Secondary,0
136,Previous qualification: Higher,0
137,Previous qualification: Basic,0


## Multivariate Analysis

Model-based selection considers all features at once, and can capture interactions. 

### Variable Selection using LASSO (L1 penalization)

Logistic regression with L1 Regularisation for Classification:

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder

In [11]:
# Encode the categorical target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Apply Logistic Regression with L1 regularization
lassomod = LogisticRegression(penalty='l1', solver='liblinear', C=0.01).fit(X, y_encoded)


In [12]:
# Select features based on the Logistic Regression model
# show if the variable influence the y 
# 0 means lasso or univariable model tells to not include the variable

model = SelectFromModel(lassomod, prefit=True)
selected_features = X.columns[(model.get_support())]

selected_features

Index(['Daytime/evening attendance', 'Previous qualification (grade)',
       'Admission grade', 'Displaced', 'Debtor', 'Tuition fees up to date',
       'Gender', 'Scholarship holder', 'Age at enrollment',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)', 'Unemployment rate',
       'Inflation rate', 'GDP', 'Marital status_single', 'Application mode_17',
       'Application mode_39', 'Application order_1', 'Application order_4',
       'Course_9085', 'Course_9119', 'Course_9147', 'Course_9238',
       'Course_9500', 'Course_96

In [13]:
Features_Select['Lasso'] = model.get_support().astype('int64')

Features_Select

Unnamed: 0,Feature,Univariate,Lasso
0,Daytime/evening attendance,1,1
1,Previous qualification (grade),1,1
2,Admission grade,1,1
3,Displaced,1,1
4,Educational special needs,1,0
...,...,...,...
134,Father unknown education,0,0
135,Previous qualification: Secondary,0,1
136,Previous qualification: Higher,0,0
137,Previous qualification: Basic,0,1


### Variable Selection using Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Encode the categorical target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Apply Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y_encoded)

In [15]:
# Select features based on the Random Forest model
model = SelectFromModel(rf, prefit=True)
selected_features = X.columns[(model.get_support())]

selected_features

Index(['Previous qualification (grade)', 'Admission grade', 'Displaced',
       'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder',
       'Age at enrollment', 'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)', 'Unemployment rate',
       'Inflation rate', 'GDP', 'Application mode_1', 'Application mode_39',
       'Father basic education'],
      dtype='object')

In [16]:
Features_Select['RandomForest'] = model.get_support().astype('int64')
Features_Select

Unnamed: 0,Feature,Univariate,Lasso,RandomForest
0,Daytime/evening attendance,1,1,0
1,Previous qualification (grade),1,1,1
2,Admission grade,1,1,1
3,Displaced,1,1,1
4,Educational special needs,1,0,0
...,...,...,...,...
134,Father unknown education,0,0,0
135,Previous qualification: Secondary,0,1,0
136,Previous qualification: Higher,0,0,0
137,Previous qualification: Basic,0,1,0


### Variable Selection using Gradient Boosting regression

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

In [18]:
# Encode the categorical target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

gbmod = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X, y_encoded)

In [19]:
# Select features based on the Gradient Boosting model
model = SelectFromModel(gbmod, prefit=True)
selected_features = X.columns[model.get_support()]

selected_features

Index(['Tuition fees up to date', 'Scholarship holder',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)'],
      dtype='object')

In [20]:
Features_Select['GradientBoost'] = model.get_support().astype('int64')
Features_Select

Unnamed: 0,Feature,Univariate,Lasso,RandomForest,GradientBoost
0,Daytime/evening attendance,1,1,0,0
1,Previous qualification (grade),1,1,1,0
2,Admission grade,1,1,1,0
3,Displaced,1,1,1,0
4,Educational special needs,1,0,0,0
...,...,...,...,...,...
134,Father unknown education,0,0,0,0
135,Previous qualification: Secondary,0,1,0,0
136,Previous qualification: Higher,0,0,0,0
137,Previous qualification: Basic,0,1,0,0


### Variable Selection using SVM regression

In [21]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [22]:
# Encode the categorical target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [23]:
# Apply SVM Classifier
svm = SVC(kernel='linear')  # Use linear kernel for feature selection
svm.fit(X, y_encoded)

# Select features based on the SVM model
model = SelectFromModel(svm, prefit=True)
selected_features = X.columns[model.get_support()]

selected_features

Index(['Debtor', 'Tuition fees up to date', 'Scholarship holder',
       'Curricular units 1st sem (approved)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (approved)', 'Marital status_widower',
       'Marital status_legally_separated', 'Application mode_2',
       'Application mode_10', 'Application mode_18', 'Application mode_27',
       'Application mode_39', 'Application mode_44', 'Application mode_53',
       'Course_33', 'Course_171', 'Course_979', 'Course_8014', 'Course_9085',
       'Course_9119', 'Course_9147', 'Course_9238', 'Course_9556',
       'Course_9853', 'Natio_German', 'Natio_Spanish', 'Natio_Italian',
       'Natio_Lithuanian', 'Natio_Angolan', 'Natio_Guinean',
       'Natio_Mozambican', 'Natio_Santomean', 'Natio_Brazilian',
       'Natio_Romanian', 'Natio_Moldova ', 'Natio_Ukrainian', 'Natio_Russian',
       'Father student', 'Father managers and executives',
       'Father intellectual and scientific professionals',
       'Fathe

In [24]:
Features_Select['SVM'] = model.get_support().astype('int64')
Features_Select

Unnamed: 0,Feature,Univariate,Lasso,RandomForest,GradientBoost,SVM
0,Daytime/evening attendance,1,1,0,0,0
1,Previous qualification (grade),1,1,1,0,0
2,Admission grade,1,1,1,0,0
3,Displaced,1,1,1,0,0
4,Educational special needs,1,0,0,0,0
...,...,...,...,...,...,...
134,Father unknown education,0,0,0,0,1
135,Previous qualification: Secondary,0,1,0,0,0
136,Previous qualification: Higher,0,0,0,0,0
137,Previous qualification: Basic,0,1,0,0,0


### Summarization and Selection of Variables 

In [29]:
Features_Select['Sum'] =  np.sum(Features_Select,axis=1)

Features_Select

  return reduction(axis=axis, out=out, **passkwargs)


Unnamed: 0,Feature,Univariate,Lasso,RandomForest,GradientBoost,SVM,Sum
0,Daytime/evening attendance,1,1,0,0,0,2
1,Previous qualification (grade),1,1,1,0,0,3
2,Admission grade,1,1,1,0,0,3
3,Displaced,1,1,1,0,0,3
4,Educational special needs,1,0,0,0,0,1
5,Debtor,1,1,1,0,1,4
6,Tuition fees up to date,1,1,1,1,1,5
7,Gender,1,1,1,0,0,3
8,Scholarship holder,1,1,1,1,1,5
9,Age at enrollment,1,1,1,0,0,3


In [30]:
Features_Select.groupby('Sum')['Feature'].count()


Sum
0    46
1    59
2    17
3    11
4     4
5     2
Name: Feature, dtype: int64

We can now decide a threshold for selecting our variables:

In [34]:
columntodrop = Features_Select[Features_Select['Sum']<=1]['Feature']

In [35]:
columntodrop =  columntodrop.to_numpy()

columntodrop

array(['Educational special needs', 'International',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (without evaluations)',
       'Marital status_single', 'Marital status_married',
       'Marital status_widower', 'Marital status_divorced',
       'Marital status_facto_union', 'Marital status_legally_separated',
       'Application mode_1', 'Application mode_2', 'Application mode_3',
       'Application mode_4', 'Application mode_5', 'Application mode_7',
       'Application mode_9', 'Application mode_10', 'Application mode_12',
       'Application mode_15', 'Application mode_16',
       'Application mode_17', 'Application mode_18',
       'Application mode_26', 'Application mode_27',
       'Application mode_35', 'Application mode_42',
       'Application mode_43', 'Application mode_44',
       'Application mode_51', 'Application mode_53',
       'Application order_0', 'Application order_1',
    

In [36]:
# Variables to keep
Features_Select[Features_Select['Sum']>=2]   

Unnamed: 0,Feature,Univariate,Lasso,RandomForest,GradientBoost,SVM,Sum
0,Daytime/evening attendance,1,1,0,0,0,2
1,Previous qualification (grade),1,1,1,0,0,3
2,Admission grade,1,1,1,0,0,3
3,Displaced,1,1,1,0,0,3
5,Debtor,1,1,1,0,1,4
6,Tuition fees up to date,1,1,1,1,1,5
7,Gender,1,1,1,0,0,3
8,Scholarship holder,1,1,1,1,1,5
9,Age at enrollment,1,1,1,0,0,3
11,Curricular units 1st sem (credited),1,1,0,0,0,2


According to the results, we can see which parameters are the most relevant for our model. We will choose to keep variables with Sum>3. 

In [38]:
df_selected = df.copy()
df_selected = df_selected.drop(columntodrop, axis=1)


In [39]:
df_selected.head()

Unnamed: 0,Daytime/evening attendance,Previous qualification (grade),Admission grade,Displaced,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,Curricular units 1st sem (credited),...,Course_9238,Course_9853,Father student,Father intermediate technicians and professionals,Father in personal service,Father other situations and unknown,Mother student,Mother no education,Father basic education,Target
0,1,126.0,122.6,0,0,1,0,1,18,0,...,1,0,0,0,1,0,0,0,1,Graduate
1,1,125.0,119.8,1,0,1,0,0,18,0,...,1,0,0,0,0,0,0,0,1,Dropout
2,1,137.0,144.7,0,0,1,1,0,18,0,...,0,0,0,1,0,0,0,0,1,Dropout
3,1,131.0,126.1,1,0,1,0,1,18,0,...,0,0,0,0,0,0,0,0,0,Enrolled
4,1,132.0,120.1,1,0,1,0,0,18,0,...,0,0,0,0,0,0,0,0,1,Graduate


In [41]:
# Export our cleaned dataframe to csv (to use it in Part 5)

df_selected.to_csv(R"D:\OneDrive\Documents\DATA SCIENCE\KAGGLE\ClassificationWithAnAcademicSuccessDataset\data\playground-series-s4e6\data2.csv")
