In [6]:
# Module importations
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression

In [7]:
# Constants / Settings
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names, so pick whichever spelling is installed.
# Falling back to the original name keeps the original error if neither exists.
_whitegrid_style = next(
    (
        name
        for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
        if name in plt.style.available
    ),
    'seaborn-whitegrid'
)
plt.style.use(_whitegrid_style)

# Figure-wide defaults: automatic tight layout plus bold, slightly larger labels and titles
plt.rc('figure', autolayout = True)
plt.rc(
    'axes',
    labelweight = 'bold',
    labelsize = 'large',
    titleweight = 'bold',
    titlesize = 14,
    titlepad = 10
)

In [11]:
# Plot builder helper method
def plot_variance(pca, width = 8, dpi = 100):
    """Plot per-component and cumulative explained variance side by side.

    Parameters
    ----------
    pca
        Object exposing ``n_components_`` and ``explained_variance_ratio_``
        (presumably a fitted scikit-learn PCA; only these two attributes are read).
    width
        Figure width, applied to the figure as ``figwidth``.
    dpi
        Figure resolution, applied to the figure as ``dpi``.

    Returns
    -------
    The two axes created by ``plt.subplots(1, 2)``: the bar chart of the
    explained variance ratio first, the cumulative curve second.
    """

    # Initialise figure
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)

    # Explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(xlabel = 'Component', title = '% Explained Variance', ylim = (0.0, 1.0))

    # Cumulative variance (a leading 0 is prepended so the curve starts at the origin)
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], 'o-')
    axs[1].set(xlabel = 'Component', title = '% Cumulative Variance', ylim = (0.0, 1.0))

    # Build figure: honour the caller's width / dpi instead of fixed values
    fig.set(figwidth = width, dpi = dpi)
    return axs

In [12]:
# Make MI scores helper method
def make_mi_scores(X, y, discrete_figures, random_state = None):
    """Compute mutual-information scores of each feature in ``X`` against ``y``.

    Parameters
    ----------
    X
        Feature table; its ``columns`` become the index of the result.
    y
        Regression target passed straight to ``mutual_info_regression``.
    discrete_figures
        Forwarded to scikit-learn as ``discrete_features`` (the keyword the
        estimator actually accepts).
    random_state
        Optional seed forwarded to ``mutual_info_regression``; the default
        ``None`` keeps scikit-learn's own default behaviour.

    Returns
    -------
    ``pd.Series`` named ``'MI Scores'``, indexed by feature name and sorted
    from highest to lowest score.
    """

    # Develop scores using regression
    mi_scores = mutual_info_regression(
        X,
        y,
        discrete_features = discrete_figures,
        random_state = random_state
    )

    # Convert to data series
    mi_scores = pd.Series(mi_scores, name = 'MI Scores', index = X.columns)

    # Sort and return
    return mi_scores.sort_values(ascending = False)

In [13]:
# Read the automobile CSV into a DataFrame.
# NOTE(review): this is an absolute Windows path, so the cell only runs where that file exists.
autos = pd.read_csv(
    filepath_or_buffer = r'C:\Developer\scratch-pad-python\Datasets\Automobile_data.csv'
)

In [14]:
# Column overview: dtype and non-null count for every column of the loaded frame
autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [15]:
# Summary statistics, transposed so that each summarised column becomes a row
autos.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
symboling,205.0,0.834146,1.245307,-2.0,0.0,1.0,2.0,3.0
wheel-base,205.0,98.756585,6.021776,86.6,94.5,97.0,102.4,120.9
length,205.0,174.049268,12.337289,141.1,166.3,173.2,183.1,208.1
width,205.0,65.907805,2.145204,60.3,64.1,65.5,66.9,72.3
height,205.0,53.724878,2.443522,47.8,52.0,54.1,55.5,59.8
curb-weight,205.0,2555.565854,520.680204,1488.0,2145.0,2414.0,2935.0,4066.0
engine-size,205.0,126.907317,41.642693,61.0,97.0,120.0,141.0,326.0
compression-ratio,205.0,10.142537,3.97204,7.0,8.6,9.0,9.4,23.0
city-mpg,205.0,25.219512,6.542142,13.0,19.0,24.0,30.0,49.0
highway-mpg,205.0,30.75122,6.886443,16.0,25.0,30.0,34.0,54.0


In [16]:
# Select and standardise a small number of features with high Mutual Information
features = ['highway-mpg', 'engine-size', 'horsepower', 'curb-weight']

# Create training and target datasets
# autos.describe() lists neither 'horsepower' nor 'price', i.e. both were read as
# non-numeric columns. Coerce every column used here to numbers (unparseable
# entries become NaN) and drop incomplete rows so X and y stay aligned.
X = (
    autos
    .loc[:, features + ['price']]
    .apply(pd.to_numeric, errors = 'coerce')
    .dropna()
)
y = X.pop('price')
X = X.loc[:, features]

# Standardise features (zero mean, unit sample standard deviation per column)
X_scaled = (X - X.mean(axis = 0)) / X.std(axis = 0)