## Preparation

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression, RFE
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.cross_decomposition import PLSRegression
from sklearn.utils import resample
from itertools import combinations

In [None]:
# Load the raw dataset ('loremipsum' is a placeholder path).
df = pd.read_csv('loremipsum')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly rather than wrapped in print() (which would emit "None").
df.info()

In [None]:
# Names of the columns stored with the generic object dtype; these are the
# ones that get one-hot encoded below.
categorical_columns = (
    df
    .select_dtypes(include=['object'])
    .columns
)

In [None]:
# Dense (non-sparse) one-hot encoder that ignores categories unseen at fit time.
ohe = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
)
# Learn the categories and encode the categorical columns in one pass.
encoded_data = ohe.fit_transform(df[categorical_columns])

In [None]:
# Output column names generated by the fitted encoder for the encoded columns.
enc_col_names = ohe.get_feature_names_out(categorical_columns)
# Carry df's index over to the encoded frame: a later column-wise concat with
# other df-derived frames aligns on index labels, so a fresh RangeIndex here
# would misalign rows whenever df's index is not the default 0..n-1.
encoded_df = pd.DataFrame(encoded_data, columns=enc_col_names, index=df.index)
display(encoded_df.head())

In [None]:
# Select every numeric dtype, not just int64/float64, so that columns stored
# as int32, float32, uint8, etc. are not silently dropped from the features.
num_data = df.select_dtypes(include='number')
final_data = pd.concat([num_data, encoded_df], axis=1)

# concat(axis=1) aligns on index labels; mismatched indexes would add rows
# (filled with NaN) instead of failing, so check the row count explicitly.
assert len(final_data) == len(df), "row count changed in concat; check index alignment"

print("Shape of final dataset:", final_data.shape)
print(final_data.head())

In [None]:
# 'colname' is a placeholder for the target column.
X = final_data.drop('colname', axis=1)
y = final_data['colname']

# Hold out 20% of the rows for testing. The seed keyword is `random_state`;
# any other spelling is rejected by train_test_split with a TypeError.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=60)

## Feature Selection

In [None]:
# Absolute correlation of every training feature with the training target,
# ordered from strongest to weakest.
corr_with_target = (
    X_train
    .corrwith(y_train)
    .abs()
    .sort_values(ascending=False)
)
# Show the ten strongest features.
print(corr_with_target.head(10))

In [None]:
# Feature importance from a random forest fitted on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
rf = RandomForestRegressor(n_estimators=100, random_state=60).fit(X_train, y_train)

# One row per feature, most important first.
fi_df = pd.DataFrame(
    {'feature': X_train.columns, 'importance': rf.feature_importances_}
).sort_values(by='importance', ascending=False)

print(fi_df.head(10))

## PCA

In [None]:
# Work on a copy so the PCA section leaves final_data untouched.
pca_df = final_data.copy()

# Separate the target from the features ('colname' is a placeholder).
y_pca = pca_df['colname']
X_pca = pca_df.drop(columns='colname')

In [None]:
# Standardise the feature matrix before PCA: learn the scaling parameters,
# then apply them to the same data.
scale = StandardScaler()
X_scaled = scale.fit(X_pca).transform(X_pca)

print(X_scaled.shape)

In [2]:
# Fit a PCA without limiting the number of components so the full
# explained-variance profile can be inspected.
pca = PCA()
# NOTE(review): this rebinds X_pca (previously the feature DataFrame) to the
# projected array, and the cell reads X_scaled from the scaling cell, so it
# only works when the notebook is executed top to bottom.
X_pca = pca.fit_transform(X_scaled)

# Running total of the per-component explained-variance ratios.
cum_var_rat = pca.explained_variance_ratio_.cumsum()

NameError: name 'X_scaled' is not defined

In [None]:
# Cumulative explained variance against the number of components kept,
# drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, len(cum_var_rat) + 1), cum_var_rat, 'bo-')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance Ratio')
ax.grid(True)
plt.show()

In [None]:
# argmax on the boolean mask gives the position of the first True, i.e. the
# first component count whose cumulative variance reaches 0.95 (+1 because
# positions are zero-based).
n_components_95 = (cum_var_rat >= 0.95).argmax() + 1
print(n_components_95)

# Refit PCA keeping only that many components and project the scaled data.
pca_95 = PCA(n_components=n_components_95)
X_pca_95 = pca_95.fit_transform(X_scaled)

print(X_pca_95.shape)

In [None]:
# Feature names used to label the loading rows.
feat_names = X.columns

# Loadings table: one row per original feature, one column per retained component.
loadings = pd.DataFrame(
    pca_95.components_.T,
    columns=[f'PC{i+1}' for i in range(n_components_95)],
    index=feat_names
)

# Print the ten largest absolute loadings for up to the first three components.
# Fewer than three components may have been retained, so cap the loop to avoid
# a KeyError on a non-existent 'PC2'/'PC3' column.
for i in range(min(3, n_components_95)):
    print(loadings[f'PC{i+1}'].abs().sort_values(ascending=False).head(10))

In [None]:
# Scatter of the first two principal components, coloured by the target.
plt.figure(figsize=(10,8))
scatter = plt.scatter(X_pca_95[:,0], X_pca_95[:,1],  c=y, cmap='viridis')
#identifyhere is a placeholder
plt.colorbar(scatter, label='identifyhere')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
# show must be called; a bare `plt.show` only references the function.
plt.show()

In [None]:
# Frame of principal-component scores, one column per retained component.
pca_df = pd.DataFrame(X_pca_95, columns = [f'PC{i+1}' for  i in range(n_components_95)])
# Attach the target by position: the new frame has a fresh 0..n-1 index, and
# assigning the Series directly would align on index labels and produce NaN
# wherever y's labels differ from that range.
pca_df['colname'] = y.to_numpy()

# Absolute correlation of each component with the target. The target's own
# entry (always 1.0) is dropped so it does not occupy the top of the ranking.
correlations = (
    pca_df.corr()['colname']
    .drop('colname')
    .abs()
    .sort_values(ascending=False)
)
print(correlations.head(10))

## Mutual Information

In [None]:
from sklearn.feature_selection import mutual_info_regression

# Mutual information between each feature and the target. The estimator is
# randomised, so a fixed seed (the same value used elsewhere in this notebook)
# keeps the scores identical across re-runs.
mi_scores = mutual_info_regression(X, y, random_state=60)
mi_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
print(mi_scores)

## Stability Selection

In [None]:
from sklearn.utils import resample

def stability_selection(X, y, n_iterations=100, sample_fraction=0.75, n_estimators=100,
                        random_state=None):
    """Average random-forest feature importances over repeated resamples.

    Each iteration draws a resample of ``X``/``y`` with ``sklearn.utils.resample``,
    fits a ``RandomForestRegressor`` on it and accumulates the forest's
    ``feature_importances_``; the mean over all iterations is returned.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like with a ``shape`` attribute
        Feature matrix.
    y : array-like
        Target values, row-aligned with ``X``.
    n_iterations : int, default 100
        Number of resample/fit rounds. Must be at least 1.
    sample_fraction : float, default 0.75
        Each resample contains ``int(len(X) * sample_fraction)`` rows.
    n_estimators : int, default 100
        Number of trees in each forest.
    random_state : None, int or numpy.random.RandomState, default None
        Seed or generator for the resampling. ``None`` leaves the resampling
        unseeded (the behaviour before this parameter existed).

    Returns
    -------
    pandas.Series
        Mean importance per feature, sorted in descending order. Indexed by
        ``X.columns`` when ``X`` has that attribute, otherwise by position.

    Raises
    ------
    ValueError
        If ``n_iterations`` is less than 1 or the resample size would be 0.
    """
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")
    n_samples = int(len(X) * sample_fraction)
    if n_samples < 1:
        raise ValueError("sample_fraction yields an empty resample")

    # A single generator is shared by all iterations: handing the same integer
    # seed to resample() every round would draw the identical sample each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    feature_importances = np.zeros(X.shape[1])
    for _ in range(n_iterations):
        X_sample, y_sample = resample(X, y, n_samples=n_samples, random_state=rng)
        # The forest's seed stays fixed, so variation between rounds comes
        # from the resampled rows only.
        rf = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
        rf.fit(X_sample, y_sample)
        feature_importances += rf.feature_importances_

    # Plain arrays have no column labels; fall back to a positional index.
    feature_index = getattr(X, 'columns', None)
    mean_importances = pd.Series(feature_importances / n_iterations, index=feature_index)
    return mean_importances.sort_values(ascending=False)

# Seeded so the reported scores are reproducible across re-runs.
stab_scores = stability_selection(X, y, random_state=60)
print(stab_scores)