# Exploring High Dimensional Data

In [None]:
# Summary statistics for the non-numeric columns only (numeric columns are excluded)
df.describe(exclude='number')

In [None]:
# Return a copy of df without the 'feature' column; df itself is not modified
df.drop(columns='feature')

In [None]:
# Pairwise plots of df with histograms on the diagonal, colored by 'gender'.
# A pairplot is a good way to visually explore small to medium-sized datasets.
sns.pairplot(data=df, hue='gender', diag_kind='hist')

**t-SNE**

In [None]:
# t-Distributed Stochastic Neighbor Embedding or t-SNE
# It is a powerful technique to visualize high dimensional data using feature extraction
# t-SNE will maximize the distance in two-dimensional space between observations that are most different in a high-dimensional space. 
# Because of this, observations that are similar will be close to one another and may become clustered. 
# t-SNE does not work with non-numeric data, so those columns must be dropped first.

In [None]:
# All imports first: the original used plt before importing it and imported seaborn twice
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

# Non-numerical columns in the dataset; they are dropped before fitting t-SNE
non_numeric = ['BMI_class', 'Height_class', 'Gender', 'Component', 'Branch']
# Also drop 'x'/'y' if a previous run of this cell already added them to df,
# so that old t-SNE output is never fed back in as input features on a re-run.
df_numeric = df.drop(non_numeric, axis=1).drop(['x', 'y'], axis=1, errors='ignore')

# Fitting t-SNE --> this projects the high-dimensional dataset onto a NumPy array with two dimensions.
# Create a t-SNE model with learning rate 50
m = TSNE(learning_rate=50)

# Fit and transform the t-SNE model on the numeric dataset
tsne_features = m.fit_transform(df_numeric)
print(tsne_features.shape)

# Assign t-SNE features to the dataset
df['x'] = tsne_features[:, 0]
df['y'] = tsne_features[:, 1]

# Plot t-SNE
sns.scatterplot(x="x", y="y", data=df)
plt.show()

# Coloring points according to BMI category --> hue is a categorical feature
sns.scatterplot(x="x", y="y", hue='BMI_class', data=df)
plt.show()

# Feature Selection I, selecting for feature information

In [None]:
# Split the ANSUR data into a target and features, then into train and test sets
from sklearn.model_selection import train_test_split

# Target (y) is the Gender column; the features (X) are every other column
y = ansur_df['Gender']
X = ansur_df.drop(columns='Gender')

# Hold out 30% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Report the row counts of both splits and the number of feature columns
n_test_rows, n_features = X_test.shape
n_train_rows = len(X_train)
print("{} rows in test set vs. {} in training set. {} Features.".format(n_test_rows, n_train_rows, n_features))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Support vector classifier with default settings, trained on the training split
svc = SVC()
svc.fit(X_train, y_train)

# Predict on each split, then score the predictions against the true labels
train_pred = svc.predict(X_train)
test_pred = svc.predict(X_test)
accuracy_train = accuracy_score(y_train, train_pred)
accuracy_test = accuracy_score(y_test, test_pred)

# A training accuracy far above the test accuracy indicates overfitting
print("{0:.1%} accuracy on test set vs. {1:.1%} on training set".format(accuracy_test, accuracy_train))

Accuracy after dimensionality reduction

You'll reduce the overfit with the help of dimensionality reduction. In this case, you'll apply a rather drastic form of dimensionality reduction by only selecting a single column that has some good information to distinguish between genders. You'll repeat the train-test split, model fit and prediction steps to compare the accuracy on test vs. training data.

In [None]:
# Keep a single feature; the double brackets make X a one-column DataFrame
X = ansur_df[['neckcircumferencebase']]

# New 70/30 split, then train a fresh default SVC on the training part
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
svc = SVC().fit(X_train, y_train)

# Score the classifier on both splits
train_pred = svc.predict(X_train)
test_pred = svc.predict(X_test)
accuracy_train = accuracy_score(y_train, train_pred)
accuracy_test = accuracy_score(y_test, test_pred)

print("{0:.1%} accuracy on test set vs. {1:.1%} on training set".format(accuracy_test, accuracy_train))

**Features with little variance**

In [None]:
# Features with little variance

from sklearn.feature_selection import VarianceThreshold

# Variance-based feature selector with a threshold of 1
sel = VarianceThreshold(threshold=1)
sel.fit(df)
mask = sel.get_support()  # boolean array, True for the features that are kept
print(mask)

reduced_df = df.loc[:, mask]
# Show the shape of the reduced frame (the original printed df.shape, which never changes)
print(reduced_df.shape)

# Create a box plot
df.boxplot()
plt.show()

# Finding a good variance threshold
# Normalize the data: divide every feature by its own mean
normalized_df = head_df / head_df.mean()
normalized_df.boxplot()
plt.show()
# Print the variances of the normalized data
print(normalized_df.var())
# Note: Pick a value that is lower than the lowest value of
# a feature you want to keep and higher than the highest value
# for a feature you want to remove.


# Normalizing the variance before feature selection

# Create a VarianceThreshold feature selector
sel = VarianceThreshold(threshold=0.001)

# Fit the selector to the normalized head_df computed above
sel.fit(normalized_df)

# Create a boolean mask
mask = sel.get_support()

# Apply the mask to the original (un-normalized) frame to create a reduced dataframe
reduced_df = head_df.loc[:, mask]

print("Dimensionality reduced from {} to {}.".format(head_df.shape[1], reduced_df.shape[1]))

**Features with missing values**

In [None]:
# Features with missing values

# Fraction of missing values per column
missing_ratio = pokemon_df.isna().sum() / len(pokemon_df)
missing_ratio

# Applying a missing value threshold:
# a column gets True when fewer than 30% of its values are missing
mask = missing_ratio < 0.3
print(mask)

# Keep only the columns that pass the threshold
reduced_df = pokemon_df.loc[:, mask]

**Pairwise Correlation**

In [None]:
import numpy as np

# Correlation matrix. corr is a method and must be called: a bare `df.corr`
# only references the method and computes nothing.
# NOTE(review): assumes df holds numeric columns only - confirm.
print(df.corr())

# Visualizing the correlation matrix
cmap = sns.diverging_palette(h_neg=10, h_pos=240, as_cmap=True)
sns.heatmap(weights_df.corr(), center=0, cmap=cmap,
            linewidths=1, annot=True, fmt=".2f")
# Render this figure now so the next heatmap is not drawn on top of it
plt.show()


# Removing duplicates

# Create the correlation matrix
corr = ansur_df.corr()

# Generate a mask for the upper triangle (diagonal included); the matrix is
# symmetric, so this half only repeats the lower one
mask = np.triu(np.ones_like(corr, dtype=bool))

# Add the mask to the heatmap: cells where mask is True are not drawn
sns.heatmap(corr, mask=mask, cmap=cmap, center=0, linewidths=1, annot=True, fmt=".2f")
plt.show()

**Removing highly correlated features**

In [None]:
# Absolute pairwise correlations between the ANSUR features
corr_matrix = ansur_df.corr().abs()

# Blank out the upper triangle and the diagonal, so every feature pair is
# looked at once and a feature is never compared with itself
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
tri_df = corr_matrix.mask(mask)

# Columns that have at least one remaining correlation above 0.95
has_high_corr = (tri_df > 0.95).any()
to_drop = tri_df.columns[has_high_corr].tolist()

# Remove those columns from the dataset
reduced_df = ansur_df.drop(columns=to_drop)

print("The reduced_df dataframe has {} columns".format(reduced_df.shape[1]))

# Feature Selection II, selecting for model accuracy

**Selecting features for model performance**

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Pre-processing the data: split first, then fit the scaler on the training part only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)

# Creating a logistic regression model
lr = LogisticRegression()
lr.fit(X_train_std, y_train)
X_test_std = scaler.transform(X_test)
y_pred = lr.predict(X_test_std)
print(accuracy_score(y_test, y_pred))

# Inspecting the feature coefficients
print(lr.coef_)

print(dict(zip(X.columns, abs(lr.coef_[0]))))

# Features that contribute little to a model
X = X.drop(columns='handlength')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Recompute the standardized arrays for the new split. The original kept the
# arrays from the first split, whose rows and column count no longer match
# y_train / X, and then used those stale arrays for RFE below.
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
lr.fit(X_train_std, y_train)
print(accuracy_score(y_test, lr.predict(X_test_std)))

# Recursive Feature Elimination
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=2, verbose=1)
rfe.fit(X_train_std, y_train)

# Inspecting the RFE results: names of the features that were kept
print(X.columns[rfe.support_])

print(dict(zip(X.columns, rfe.ranking_)))
print(accuracy_score(y_test, rfe.predict(X_test_std)))

In [None]:
### Build a diabetes classifier #####

# Fit the scaler on the training features and transform these in one go
X_train_std = scaler.fit_transform(X_train)

# Fit the logistic regression model on the scaled training data
lr.fit(X_train_std, y_train)

# Scale the test features with the scaler fitted on the training data
X_test_std = scaler.transform(X_test)

# Predict diabetes presence on the scaled test set
y_pred = lr.predict(X_test_std)

# Prints accuracy metrics and feature coefficients
print("{0:.1%} accuracy on test set.".format(accuracy_score(y_test, y_pred)))
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))


### Manual Recursive Feature Elimination ###

# Feature set for this round. To eliminate a feature manually, delete the name
# with the lowest coefficient from this list and re-run; all eight features are
# still listed here.
X = diabetes_df[['pregnant', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi', 'family', 'age']]

# Performs a 25-75% train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Scales the features once and keeps the scaled arrays for every model below
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Fits the logistic regression model on the scaled training data
lr.fit(X_train_std, y_train)

# Calculates the accuracy on the test set and prints coefficients
acc = accuracy_score(y_test, lr.predict(X_test_std))
print("{0:.1%} accuracy on test set.".format(acc))
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

### Automatic Recursive Feature Elimination ###

# Create the RFE with a LogisticRegression estimator and 3 features to select
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=3, verbose=1)

# Fit the eliminator to the scaled data: RFE ranks features by coefficient
# size, which is only comparable across features on a common scale
# (the original fitted it on the unscaled X_train)
rfe.fit(X_train_std, y_train)

# Print the features and their ranking (high = dropped early on)
print(dict(zip(X.columns, rfe.ranking_)))

# Print the features that are not eliminated
print(X.columns[rfe.support_])

# Calculates the test set accuracy
acc = accuracy_score(y_test, rfe.predict(X_test_std))
print("{0:.1%} accuracy on test set.".format(acc))

**Tree-based Feature Selection**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

# Random forest classifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
# (the original line was missing its closing parenthesis)
print(accuracy_score(y_test, rf.predict(X_test)))

# Feature importance values, read from the forest fitted above
print(rf.feature_importances_)
print(sum(rf.feature_importances_))  # ---> 1.0

# Feature importance as a feature selector
mask = rf.feature_importances_ > 0.1
print(mask)

X_reduced = X.loc[:, mask]
print(X_reduced.columns)

# RFE with random forests
rfe = RFE(estimator=RandomForestClassifier(),
          n_features_to_select=6, verbose=1)
rfe.fit(X_train, y_train)

print(accuracy_score(y_test, rfe.predict(X_test)))

# Names of the features kept by RFE
print(X.columns[rfe.support_])

In [None]:
### Building a random forest model  ###

# 75/25 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Seeded random forest trained on the training split
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Accuracy on the held-out rows
test_pred = rf.predict(X_test)
acc = accuracy_score(y_test, test_pred)

# Importance of every feature, rounded to two decimals and keyed by column name
importances = rf.feature_importances_.round(2)
print(dict(zip(X.columns, importances)))

print("{0:.1%} accuracy on test set.".format(acc))

### Random Forest for Feature Selection ###

# True for the features whose importance is above 0.15
mask = rf.feature_importances_ > 0.15
print(mask)

# Keep only those columns of X and show their names
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

### Recursive Feature Elimination with random forests  ####

# RFE around a random forest, keeping 2 features; step=2 is the important setting here
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=2, step=2, verbose=1)
rfe.fit(X_train, y_train)

# support_ is the boolean mask of the features RFE kept
mask = rfe.support_

# Select those columns from X and print their names
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

**Regularized Linear Regression**

In [None]:
# Linear regression in Python

from sklearn.linear_model import Lasso, LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
# Actual coefficients = [5 2 0]
print(lr.coef_)
# Output recorded in the original notes: [ 4.95 1.83 -0.05]
# (it was pasted into the cell as code, which is a syntax error)

# Actual intercept = 20
print(lr.intercept_)

# Calculates R-squared
print(lr.score(X_test, y_test))


# Lasso regressor
la = Lasso()
la.fit(X_train, y_train)
# Actual coefficients = [5 2 0]
print(la.coef_)

print(la.score(X_test, y_test))

In [None]:
### Creating a LASSO Regressor ###

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize: the scaling is learned from the training features only
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)

# Lasso with default settings, fitted on the standardized training data
la = Lasso()
la.fit(X_train_std, y_train)

### Lasso Model Results  ###

# Reuse the already-fitted scaler for the test features
X_test_std = scaler.transform(X_test)

# R squared (coefficient of determination) on the standardized test set
r_squared = la.score(X_test_std, y_test)
print("The model can predict {0:.1%} of the variance in the test set.".format(r_squared))

# Boolean array, True where Lasso shrank a coefficient to exactly zero
zero_coef = la.coef_ == 0

# Number of zeroed (ignored) features out of all features
n_ignored = sum(zero_coef)
n_total = len(la.coef_)
print("The model has ignored {} out of {} features.".format(n_ignored, n_total))


### Adjusting the regularization strength ###

# Your current Lasso model has an R2 score of 84.7%. Overly strong regularization
# can give a model high bias, which hurts its predictive power.
# Tweak alpha to improve the balance between predictive power and model simplicity.

# Goal: find the highest alpha value with R-squared above 98%
la = Lasso(alpha=0.1, random_state=0)
la.fit(X_train_std, y_train)

# Performance stats: test-set R squared and number of zeroed coefficients
r_squared = la.score(X_test_std, y_test)
n_ignored_features = sum(la.coef_ == 0)

# Print performance stats
print("The model can predict {0:.1%} of the variance in the test set.".format(r_squared))
print("{} out of {} features were ignored.".format(n_ignored_features, len(la.coef_)))

**Combining Feature Selectors**

In [None]:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso, LassoCV

# Lasso regressor with a hand-picked alpha: the value is set manually to balance
# removing as many features as possible against model accuracy
la = Lasso(alpha=0.05)
la.fit(X_train, y_train)
# Actual coefficients = [5 2 0]
print(la.coef_)

print(la.score(X_test, y_test))

# LassoCV regressor ---> finds the optimal alpha value
lcv = LassoCV()
lcv.fit(X_train, y_train)
print(lcv.alpha_)

# LassoCV regressor ---> find the zero coefficients and reduce the features by masking the dataset
mask = lcv.coef_ != 0
reduced_X = X.loc[:, mask]

##### Use a combination of models for feature selection ######

# Feature selection with LassoCV.
# The LassoCV class uses cross-validation to try out different alpha
# settings and selects the best one.
lcv = LassoCV()
lcv.fit(X_train, y_train)
lcv.score(X_test, y_test)
lcv_mask = lcv.coef_ != 0
sum(lcv_mask)  # --- > return 66 features to keep

# Feature selection with random forest
rfe_rf = RFE(
    estimator=RandomForestRegressor(),
    n_features_to_select=66,
    step=5,
    verbose=1,
)
rfe_rf.fit(X_train, y_train)
rf_mask = rfe_rf.support_

# Feature selection with gradient boosting
rfe_gb = RFE(
    estimator=GradientBoostingRegressor(),
    n_features_to_select=66,
    step=5,
    verbose=1,
)
rfe_gb.fit(X_train, y_train)
gb_mask = rfe_gb.support_

# Combining the feature selectors: per feature, count how many of the three masks kept it
votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)
print(votes)

# To be sure no information is lost, all features with at least one vote could
# be selected. Here a feature is kept only when at least two models voted for it.
mask = votes >= 2
reduced_X = X.loc[:, mask]

In [None]:
#### Creating a LassoCV regressor ####

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LassoCV, LinearRegression

# Create and fit the LassoCV model on the training set
lcv = LassoCV()
lcv.fit(X_train, y_train)
print('Optimal alpha = {0:.3f}'.format(lcv.alpha_))

# Calculate R squared on the test set
r_squared = lcv.score(X_test, y_test)
print('The model explains {0:.1%} of the test set variance'.format(r_squared))

# Create a mask for coefficients not equal to zero
lcv_mask = lcv.coef_ != 0
print('{} features out of {} selected'.format(sum(lcv_mask), len(lcv_mask)))

### Ensemble models for extra voters  ###

# Gradient Boosting Regressor #

# Select 10 features with RFE on a GradientBoostingRegressor, drop 3 features on each step
rfe_gb = RFE(estimator=GradientBoostingRegressor(),
             n_features_to_select=10, step=3, verbose=1)
rfe_gb.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_gb.score(X_test, y_test)
print('The model can explain {0:.1%} of the variance in the test set'.format(r_squared))

# Assign the support array to gb_mask
gb_mask = rfe_gb.support_


# Random Forest Regressor #

# Select 10 features with RFE on a RandomForestRegressor, drop 3 features on each step
rfe_rf = RFE(estimator=RandomForestRegressor(),
             n_features_to_select=10, step=3, verbose=1)
rfe_rf.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_rf.score(X_test, y_test)
print('The model can explain {0:.1%} of the variance in the test set'.format(r_squared))

# Assign the support array to rf_mask
rf_mask = rfe_rf.support_


# Combining 3 feature selectors #

# Sum the votes of the three models
votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)
print(votes)

# Create a mask for features selected by all 3 models
meta_mask = votes >= 3
print(meta_mask)

# Apply the dimensionality reduction on X
X_reduced = X.loc[:, meta_mask]
print(X_reduced.columns)

# Plug the reduced dataset into a linear regression pipeline.
# lm was never created anywhere in the notebook, so it is defined here;
# scaler is the StandardScaler created in an earlier cell.
lm = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=0)
lm.fit(scaler.fit_transform(X_train), y_train)
r_squared = lm.score(scaler.transform(X_test), y_test)
print('The model can explain {0:.1%} of the variance in the test set using {1:} features.'.format(r_squared, len(lm.coef_)))

# Feature Extraction

Feature extraction means calculating new features based on the existing ones while trying to lose as little information as possible. It creates new features, which are in fact combinations of the original ones.

PCA:

For this technique, it is important to scale the features first, so that their values are easier to compare. 

We can add a reference point to the very center of the point cloud, and then point a vector in the direction of this strongest pattern. We can add a second vector perpendicular to the first one to account for the rest of the variance in this dataset. 

Every point in the dataset could be described by multiplying and then summing two perpendicular vectors. We essentially created a new reference system aligned with the variance in the data. The coordinates that each point has in this new reference system are called principal components, and they are the foundation of principal component analysis (PCA).

**Feature Extraction**

In [None]:
# Feature generation - BMI
df_body['BMI'] = df_body['Weight kg'] / df_body['Height m'] ** 2
# drop() returns a new frame and leaves df_body untouched, so the result must be
# kept; the original discarded it, which made the call a no-op
df_bmi = df_body.drop(['Weight kg', 'Height m'], axis=1)

# Feature generation - averages: row-wise mean of the two leg measurements
leg_df['leg mm'] = leg_df[['right leg mm', 'left leg mm']].mean(axis=1)

# Intro to PCA
sns.scatterplot(data=df, x='handlength', y='footlength')

# Standardize the features and wrap the resulting array in a DataFrame
# with the original column names
scaler = StandardScaler()
df_std = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

**Principal Component Analysis**

Principal components share no duplicate information, and they are ranked from most to least important.

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Calculating the principal components: standardize first, then run PCA
scaler = StandardScaler()
std_df = scaler.fit_transform(df)

pca = PCA()
principal_components = pca.fit_transform(std_df)
print(principal_components)

# Principal component explained variance ratio
pca = PCA().fit(std_df)
print(pca.explained_variance_ratio_)

# PCA for dimensionality reduction: cumulative explained variance per component
pca = PCA().fit(ansur_std_df)
cumulative_ratio = pca.explained_variance_ratio_.cumsum()
print(cumulative_ratio)

# Understanding the components
print(pca.components_)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Create a pairplot to inspect ansur_df
sns.pairplot(ansur_df)

plt.show()

# Create the scaler and standardize the data
scaler = StandardScaler()
ansur_std = scaler.fit_transform(ansur_df)

# Create the PCA instance and fit and transform the data with pca
pca = PCA()
pc = pca.fit_transform(ansur_std)

# This changes the numpy array output back to a dataframe. The column names
# ('PC 1', 'PC 2', ...) follow the number of components PCA returned, instead
# of a hard-coded list that only fits exactly four components.
pc_columns = ['PC {}'.format(i + 1) for i in range(pc.shape[1])]
pc_df = pd.DataFrame(pc, columns=pc_columns)

# Create a pairplot of the principal component dataframe
sns.pairplot(pc_df)
plt.show()

In [None]:
# PCA on a larger dataset

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the ANSUR features
scaler = StandardScaler()
ansur_std = scaler.fit_transform(ansur_df)

# Fit PCA on the standardized data
pca = PCA().fit(ansur_std)

# Explained variance ratio per component, followed by its running total
variance_ratio = pca.explained_variance_ratio_
print(variance_ratio)
print(variance_ratio.cumsum())

**PCA applications**

In [None]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Understanding the components
print(pca.components_)

# PCA in a pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA())])
pc = pipe.fit_transform(ansur_df)
print(pc[:, :2])

# Checking the effect of categorical features: add the first two components
# to the categorical frame and color the scatter plot by height class
ansur_categories['PC 1'] = pc[:, 0]
ansur_categories['PC 2'] = pc[:, 1]
sns.scatterplot(data=ansur_categories, x='PC 1',
                y='PC 2', hue='Height_class', alpha=0.4)

# PCA in a model pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA(n_components=3)),
    ('classifier', RandomForestClassifier())])
pipe.fit(X_train, y_train)
print(pipe.steps[1])

# Cumulative explained variance of the 3 components; the original computed
# this value but never displayed it
print(pipe.steps[1][1].explained_variance_ratio_.cumsum())
print(pipe.score(X_test, y_test))

In [None]:
# Two-component PCA on standardized features
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA(n_components=2)),
])

# Fit on poke_df and read the component vectors from the 'reducer' step
pipe.fit(poke_df)
vectors = pipe.named_steps['reducer'].components_.round(2)

# Print feature effects: the weight of each original feature in each component
feature_names = poke_df.columns
print('PC 1 effects = ' + str(dict(zip(feature_names, vectors[0]))))
print('PC 2 effects = ' + str(dict(zip(feature_names, vectors[1]))))

# PCA for feature exploration
# Same pipeline again, this time also transforming the data
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA(n_components=2)),
])
pc = pipe.fit_transform(poke_df)

print(pc)

# Store the two components next to the categorical columns
poke_cat_df['PC 1'] = pc[:, 0]
poke_cat_df['PC 2'] = pc[:, 1]

print(poke_cat_df.head())

# Scatter plot of PC 1 vs PC 2, colored by the Type feature
sns.scatterplot(data=poke_cat_df,
                x='PC 1', y='PC 2', hue='Type')
plt.show()

### PCA in a model pipeline

# Scaler -> 2-component PCA -> seeded random forest classifier
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA(n_components=2)),
    ('classifier', RandomForestClassifier(random_state=0)),
])

# Fit the whole pipeline to the training data
pipe.fit(X_train, y_train)

# Explained variance ratio of the two components
print(pipe.named_steps['reducer'].explained_variance_ratio_)

# Accuracy on the test set
accuracy = pipe.score(X_test, y_test)
print('{0:.1%} test set accuracy'.format(accuracy))

**Principal Component Selection**

In [None]:
# Setting an explained variance threshold:
# a float n_components tells PCA how much of the variance to keep (here 90%)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA(n_components=0.9))])
# Fit the pipe to the data
pipe.fit(poke_df)
print(len(pipe.steps[1][1].components_))

# An optimal number of components --- > 'Elbow' in the plot
# (pipe is already fitted on poke_df above, so it is not fitted again)
var = pipe.steps[1][1].explained_variance_ratio_
plt.plot(var)
plt.xlabel('Principal component index')
plt.ylabel('Explained variance ratio')
plt.show()

# Compressing images
pipe = Pipeline([('scaler', StandardScaler()),
                 ('reducer', PCA(n_components=290))])
pipe.fit(X_train)
# Only transform the test set. The original called fit_transform here, which
# re-fitted the scaler and PCA on X_test and discarded the fit on X_train.
pc = pipe.transform(X_test)
print(pc.shape)

# Rebuilding images
pc = pipe.transform(X_test)
print(pc.shape)

# Map the components back to the original feature space
X_rebuilt = pipe.inverse_transform(pc)
print(X_rebuilt.shape)

img_plotter(X_rebuilt)

In [None]:
# Selecting the proportion of variance to keep
# Pipe a scaler to PCA selecting 80% of the variance
pipe = Pipeline([('scaler', StandardScaler()),
                 ('reducer', PCA(n_components=0.8))])

# Fit the pipe to the data
pipe.fit(ansur_df)

# Number of components PCA kept
print('{} components selected'.format(len(pipe.steps[1][1].components_)))


## Choosing the number of components ##

# Pipeline a scaler and pca selecting 10 components
pipe = Pipeline([('scaler', StandardScaler()),
                 ('reducer', PCA(n_components=10))])

# Fit the pipe to the data
pipe.fit(ansur_df)

# Plot the explained variance ratio
plt.plot(pipe.steps[1][1].explained_variance_ratio_)

plt.xlabel('Principal component index')
plt.ylabel('Explained variance ratio')
plt.show()

## PCA for image compression ## 

# Plot the MNIST sample
plot_digits(X_test)


# Transform the input data to principal components
# NOTE(review): the only `pipe` visible in this cell was fitted on ansur_df with
# 10 components; presumably a pipeline fitted on the image training data was
# meant here - confirm and refit before transforming X_test.
pc = pipe.transform(X_test)

# Prints the number of features per dataset
print("X_test has {} features".format(X_test.shape[1]))
print("pc has {} features".format(pc.shape[1]))

# Inverse transform the components to original feature space
X_rebuilt = pipe.inverse_transform(pc)

# Prints the number of features
print("X_rebuilt has {} features".format(X_rebuilt.shape[1]))


# Transform the input data to principal components
# (repeats the transform / inverse-transform steps above before plotting)
pc = pipe.transform(X_test)

# Inverse transform the components to original feature space
X_rebuilt = pipe.inverse_transform(pc)

# Plot the reconstructed data
plot_digits(X_rebuilt)