# **Feature Selection Methods:**

### 1. Filter Method:

#### a. Pearson's Correlation Coefficient:

In [1]:
import pandas as pd
import seaborn as sns 
from scipy.stats import pearsonr # pearsonr only for numerical values as it works on statistical data

# Load the data 
tips= sns.load_dataset('tips')

X= tips.drop('tip', axis=1)
y=tips['tip']

# Calculate the pearsonr correlation between features and target variables 

correlation= tips.corr(method='pearson')['tip'].sort_values()

# now we will select features having corr>0.2 and <-0.2 as our final features
selected_features= correlation[abs(correlation) > 0.2].index.tolist()
selected_features 

  correlation= tips.corr(method='pearson')['tip'].sort_values()


['size', 'total_bill', 'tip']

### 2. Wrapper Methods
#### RFE (Recrusive Feature Elimination)

In [3]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Load the tips dataset
tips=sns.load_dataset('tips')
features=tips.drop('tip', axis=1)
target=tips['tip']

# Label encode the features 
from sklearn.preprocessing import LabelEncoder
le= LabelEncoder()
features['sex']= le.fit_transform(features['sex'])
features['smoker']= le.fit_transform(features['smoker'])
features['day']= le.fit_transform(features['day'])
features['time']= le.fit_transform(features['time'])
features['size']= le.fit_transform(features['size'])

model=LinearRegression()
selector=RFE(model, n_features_to_select=10, step=1)
selector.fit(features, target)

selected_features=features.columns[selector.get_support()]
print("Selected Feature", selected_features)

Selected Feature Index(['total_bill', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')


#### SFS (Sequential Feature Selection)

In [18]:
import seaborn as sns
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

tips = sns.load_dataset('tips')
features = tips.drop('tip', axis=1)
target = tips['tip']

# Label encode the features 
from sklearn.preprocessing import LabelEncoder
le= LabelEncoder()
features['sex']= le.fit_transform(features['sex'])
features['smoker']= le.fit_transform(features['smoker'])
features['day']= le.fit_transform(features['day'])
features['time']= le.fit_transform(features['time'])
features['size']= le.fit_transform(features['size'])

model = LinearRegression()
selector = SequentialFeatureSelector(model, k_features=3, forward=True, scoring='neg_mean_squared_error')
selector.fit(features, target)

selected_features = features.columns[list(selector.k_feature_idx_)]
print('Selected Features:', selected_features)

Selected Features: Index(['total_bill', 'sex', 'size'], dtype='object')


### 3. Embedded Methods

#### Lasso Regression 

In [20]:
from sklearn.linear_model import Lasso

# Load the tips dataset
tips=sns.load_dataset('tips')
features=tips.drop('tip', axis=1)
target=tips['tip']

# Label encode the features 
from sklearn.preprocessing import LabelEncoder
le= LabelEncoder()
features['sex']= le.fit_transform(features['sex'])
features['smoker']= le.fit_transform(features['smoker'])
features['day']= le.fit_transform(features['day'])
features['time']= le.fit_transform(features['time'])
features['size']= le.fit_transform(features['size'])

model=Lasso(alpha=0.1)
model.fit(features, target)

selected_features=features.columns[model.coef_!=0]
print('Selected Features:', selected_features)

Selected Features: Index(['total_bill', 'size'], dtype='object')


### 4. Dimensionality Reduction Method

#### PCA

In [26]:
import pandas as pd 
import numpy as np 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the Diamonds dataset 
df = sns.load_dataset('diamonds')

# Convert the categorical variable to numerical 
df['cut'] = pd.factorize(df['cut'])[0]
df['color'] = pd.factorize(df['color'])[0]
df['clarity'] = pd.factorize(df['clarity'])[0]

# split the features and target variables 
X = df.drop('price', axis=1).values
y = df['price'].values

# standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform PCA and select the most important features 
pca = PCA(n_components=4)
X_pca = pca.fit_transform(X_scaled)
most_important_features=np.abs(pca.components_).argmax(axis=1)

# Print the most important features 
print("Most important features:", df.columns[most_important_features])

Most important features: Index(['price', 'table', 'depth', 'color'], dtype='object')
