## <span style="color : green"> Feature Engineering </span>

# <center> Table of Contents </center>

1. Split data into training and testing 
2. Drop independent variable based on - mean, median, variance
1. Finding outliers based on z-score and box-plot
1. correlation between independent variables
1. Correlation between independent variable and dependent
1. Select Features based on Chi-Square Test, Information Gain, VarianceThreshold
1. Feature Extraction using PCA and LDA
1. Mean of Pixel value of Image 

In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import chi2, SelectKBest, mutual_info_classif, VarianceThreshold
from sklearn.model_selection import train_test_split
from warnings import filterwarnings


filterwarnings('ignore')

In [78]:
df = pd.read_csv('../Datasets/Toyato.csv')
df.head()

Unnamed: 0,Price,Age,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,Automatic,cc,Doors,Gears,Quarterly_Tax,Weight,Mfr_Guarantee,BOVAG_Guarantee,Guarantee_Period
0,12950,23,10,2002,46986,Diesel,90,1,0,2000,3.0,Five,210,1165,0.0,1,3
1,12950,23,10,2002,72937,Diesel,90,1,0,2000,3.0,Five,210,1165,0.0,1,3
2,12950,24,9,2002,41711,Diesel,90,1,0,2000,3.0,Five,210,1165,1.0,1,3
3,13950,26,7,2002,48000,Diesel,90,0,0,2000,3.0,Five,210,1165,1.0,1,4
4,17950,30,3,2002,38500,Diesel,90,0,0,2000,3.0,Five,210,1170,1.0,1,5


In [79]:
df.shape

(50, 17)

## Split data into training and testing

In [80]:
x = df.drop('Fuel_Type', axis=1)
x = x.select_dtypes(include=['int64', 'float64'])
y = df[['Fuel_Type']]

xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size=0.2, random_state=0)

## Drop independent variable based on - 

    1. If mean less than median 
    2. bIf variance is less than the theshold

In [81]:
# Checking if mean is less than median

low_mean_features = list()

for col in x.columns:
    mean = x[col].mean()
    median = x[col].median()
    if mean < median :
        low_mean_features.append(col)

print(low_mean_features)

['Age', 'Met_Color', 'Mfr_Guarantee', 'BOVAG_Guarantee']


In [82]:
# Checking if variance is less than threshold

low_variance_features = list()
threshold = 0.1

for col in x.columns:
    variance = x[col].var()
    if variance < threshold :
        low_variance_features.append(col)

print(low_variance_features)

['Mfg_Year', 'Automatic', 'BOVAG_Guarantee']


## Finding outliers based on z-score and box-plot

In [83]:
x['z-score-price'] = ( x.Price - x.Price.mean() ) / x.Price.std()
x = x[(x['z-score-price'] > -3) & (x['z-score-price'] < 3) ]
x.head()

Unnamed: 0,Price,Age,Mfg_Month,Mfg_Year,KM,HP,Met_Color,Automatic,cc,Doors,Quarterly_Tax,Weight,Mfr_Guarantee,BOVAG_Guarantee,Guarantee_Period,z-score-price
0,12950,23,10,2002,46986,90,1,0,2000,3.0,210,1165,0.0,1,3,-0.849159
1,12950,23,10,2002,72937,90,1,0,2000,3.0,210,1165,0.0,1,3,-0.849159
2,12950,24,9,2002,41711,90,1,0,2000,3.0,210,1165,1.0,1,3,-0.849159
3,13950,26,7,2002,48000,90,0,0,2000,3.0,210,1165,1.0,1,4,-0.454219
4,17950,30,3,2002,38500,90,0,0,2000,3.0,210,1170,1.0,1,5,1.125537


## correlation between independent variables

In [84]:
threshold = 0.75

correlated_features = []

correlation_matrix = x.corr()

for i in range(correlation_matrix.shape[0]):
    for j in range(i+1, correlation_matrix.shape[1]):
        if abs(correlation_matrix.iloc[i,j]) > threshold:
            correlated_features.append(correlation_matrix.columns[i])

print(list(set(correlated_features)))

['Mfg_Month', 'Price', 'Age', 'Quarterly_Tax', 'cc']


## Correlation between independent variable and dependent

In [87]:
encoder = LabelEncoder()
labels = encoder.fit_transform(y)

x.fillna(x.mean(), inplace=True)

In [88]:
# To find out the unrelated attribute w/r to target 

threshold = 0.1

uncorrelated_features = []

for feature in x.columns:
    correlation, _ = pearsonr(x[feature] ,labels )
    if abs(correlation) < threshold:
        uncorrelated_features.append(feature)
        
print(uncorrelated_features)

['Met_Color', 'Automatic']


## Select Features based on Chi-Square Test, SelectPercentile, VarianceThreshold

In [97]:
# Using Chi-Square Test 

selector = SelectKBest(score_func=chi2, k=3)
new_df = selector.fit_transform(abs(x), labels)

print(f'The selected attributes were : {list(x.columns[selector.get_support()])}')

The selected attributes were : ['KM', 'cc', 'Quarterly_Tax']


In [99]:
# Using Information Gain 

selector = SelectKBest(score_func=mutual_info_classif, k=3)
new_df = selector.fit_transform(abs(x), labels)

print(f'The selected attributes were : {list(x.columns[selector.get_support()])}')

The selected attributes were : ['HP', 'cc', 'Quarterly_Tax']


In [103]:
# Using Variance Threshold

selector = VarianceThreshold(0.4)
new_df = selector.fit_transform(x)
print(f'Old Feature shape = {x.shape}\nNew Feature Shape = {new_df.shape}')

Old Feature shape = (50, 16)
New Feature Shape = (50, 11)


## Feature Extraction using PCA and LDA

In [106]:
# Principle Component Analysis

analyser = PCA(n_components=7)
extracted = analyser.fit_transform(x)

print(np.shape(extracted))

(50, 7)


In [114]:
# Linear Discriminant Analysis

analyser = LinearDiscriminantAnalysis(n_components=1)
extracted = analyser.fit_transform(x, y)
print(np.shape(extracted))

(50, 1)


## Mean of Pixel value of Image

In [104]:
from PIL import Image

image = Image.open("image.png")

image_array = np.array(image)
mean_pixels= np.mean(image_array)

print(f"Mean pixel value: {round(mean_pixels,2)}")

Mean pixel value: 72.43
