# Feature Engineering Notebook

## Load Dataset

In [1]:
import pandas as pd
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,open,high,low,close,volume,marketCap,timestamp,crypto_name,date
0,0,112.900002,118.800003,107.142998,115.910004,0.0,1288693000.0,2013-05-05T23:59:59.999Z,Bitcoin,2013-05-05
1,1,3.49313,3.69246,3.34606,3.59089,0.0,62298190.0,2013-05-05T23:59:59.999Z,Litecoin,2013-05-05
2,2,115.980003,124.663002,106.639999,112.300003,0.0,1249023000.0,2013-05-06T23:59:59.999Z,Bitcoin,2013-05-06
3,3,3.59422,3.78102,3.11602,3.37125,0.0,58594360.0,2013-05-06T23:59:59.999Z,Litecoin,2013-05-06
4,4,112.25,113.444,97.699997,111.5,0.0,1240594000.0,2013-05-07T23:59:59.999Z,Bitcoin,2013-05-07


## Missing Value Summary

In [2]:
df.isnull().sum()

Unnamed: 0     0
open           0
high           0
low            0
close          0
volume         0
marketCap      0
timestamp      0
crypto_name    0
date           0
dtype: int64

## Impute Numeric Missing Values (Median)

In [3]:
numeric = df.select_dtypes(include=['number']).columns
df[numeric] = df[numeric].fillna(df[numeric].median())

## Impute Categorical Missing Values (Mode)

In [4]:
categorical = df.select_dtypes(include=['object']).columns
for c in categorical: df[c] = df[c].fillna(df[c].mode()[0])

## Label Encoding Categorical Columns

In [6]:
!pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Downloading scipy-1.16.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.8.0-cp312-cp312-win_amd64.whl (8.0 MB)
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.0 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.0 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.0 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.0 MB 560.1 kB/s e


[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for c in categorical: df[c] = le.fit_transform(df[c])

## One-Hot Encoding (Optional)

In [8]:
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0.1,Unnamed: 0,open,high,low,close,volume,marketCap,timestamp,crypto_name,date
0,0,112.900002,118.800003,107.142998,115.910004,0.0,1288693000.0,0,8,0
1,1,3.49313,3.69246,3.34606,3.59089,0.0,62298190.0,0,30,0
2,2,115.980003,124.663002,106.639999,112.300003,0.0,1249023000.0,1,8,1
3,3,3.59422,3.78102,3.11602,3.37125,0.0,58594360.0,1,30,1
4,4,112.25,113.444,97.699997,111.5,0.0,1240594000.0,2,8,2


## Feature Scaling (StandardScaler)

In [9]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
df[df.columns] = sc.fit_transform(df[df.columns])

## Handling Outliers (IQR-cap)

In [10]:
Q1=df.quantile(0.25); Q3=df.quantile(0.75); IQR=Q3-Q1
for col in df.select_dtypes(include='number'):
 df[col]=df[col].clip(Q1[col]-1.5*IQR[col], Q3[col]+1.5*IQR[col])

## Creating Interaction Features

In [11]:
import itertools
num_cols=df.select_dtypes(include='number').columns
for a,b in itertools.combinations(num_cols[:5],2): df[f'{a}_x_{b}']=df[a]*df[b]

## Creating Polynomial Features

In [12]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_df = pd.DataFrame(poly.fit_transform(df.select_dtypes(include='number')))
poly_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,220,221,222,223,224,225,226,227,228,229
0,-1.732027,-0.153924,-0.153329,-0.154019,-0.153934,-0.229533,-0.179447,-2.502156,-1.331232,-2.502156,...,0.000561,0.00056,0.000559,0.000562,0.000558,0.000557,0.00056,0.000557,0.00056,0.000562
1,-1.73198,-0.165666,-0.165362,-0.165516,-0.165736,-0.229533,-0.195797,-2.502156,0.090683,-2.502156,...,0.000754,0.000751,0.000752,0.000753,0.000749,0.00075,0.000751,0.000751,0.000752,0.000753
2,-1.731932,-0.153924,-0.153329,-0.154019,-0.153934,-0.229533,-0.179976,-2.502156,-1.331232,-2.502156,...,0.000561,0.00056,0.000559,0.000562,0.000558,0.000557,0.00056,0.000557,0.00056,0.000562
3,-1.731885,-0.165647,-0.165346,-0.165561,-0.165778,-0.229533,-0.195846,-2.502156,0.090683,-2.502156,...,0.000754,0.000752,0.000753,0.000754,0.000749,0.00075,0.000751,0.000751,0.000752,0.000753
4,-1.731837,-0.153924,-0.153329,-0.154019,-0.153934,-0.229533,-0.180088,-2.502156,-1.331232,-2.502156,...,0.000561,0.00056,0.000559,0.000562,0.000558,0.000557,0.00056,0.000557,0.00056,0.000562


## Log Transform of Skewed Columns

In [13]:
import numpy as np
skewed = df.skew()[df.skew()>1].index
for col in skewed: df[col] = np.log1p(df[col])

## Binning Numerical Features

In [14]:
for col in df.select_dtypes(include='number').columns[:3]: df[f'{col}_bin']=pd.cut(df[col], bins=5, labels=False)

## Feature Selection - Variance Threshold

In [15]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=0.0)
df_sel = sel.fit_transform(df)
df_sel[:5]

array([[-1.73202706, -0.16714571, -0.16644258, -0.16725885, -0.16715796,
        -0.26075852, -0.19777703, -2.50215616, -1.33123165, -2.50215616,
         0.26659998,  0.26556924,  0.26676577,  0.26661793,  0.0236009 ,
         0.02370723,  0.02369409,  0.02361557,  0.02360249,  0.02370883,
         0.        ,  4.        ,  4.        ],
       [-1.73197957, -0.18112149, -0.18075733, -0.18094174, -0.18120486,
        -0.26075852, -0.21790321, -2.50215616,  0.09068269, -2.50215616,
         0.28693014,  0.28640382,  0.28667038,  0.28705062,  0.02739488,
         0.02742038,  0.02745675,  0.02737008,  0.02740639,  0.02743189,
         0.        ,  0.        ,  0.        ],
       [-1.73193209, -0.16714571, -0.16644258, -0.16725885, -0.16715796,
        -0.26075852, -0.19842175, -2.50215616, -1.33123165, -2.50215616,
         0.26658536,  0.26555467,  0.26675114,  0.26660331,  0.0236009 ,
         0.02370723,  0.02369409,  0.02361557,  0.02360249,  0.02370883,
         0.        ,  4.    

## Feature Selection - Mutual Information

In [16]:
from sklearn.feature_selection import mutual_info_classif
# replace target
# mi = mutual_info_classif(df.drop('target', axis=1), df['target'])

## PCA for Dimensionality Reduction

In [17]:
from sklearn.decomposition import PCA
pca = PCA(n_components=5)
pca_features = pca.fit_transform(df)
pca_features[:5]

array([[ 1.98938889, -5.82953317, -0.74287348,  1.18581596,  0.02921576],
       [-3.17717787, -3.2188135 , -0.02963653,  1.18610527,  0.0289171 ],
       [ 1.98940075, -5.82949293, -0.74287511,  1.18582551,  0.02927339],
       [-3.17716036, -3.21877721, -0.02963655,  1.18611872,  0.02898893],
       [ 1.98942011, -5.82945137, -0.74287534,  1.18584025,  0.02935327]])

## Save Final Feature Matrix

In [18]:
df.to_csv('final_feature_engineered.csv', index=False)
df.head()

Unnamed: 0.1,Unnamed: 0,open,high,low,close,volume,marketCap,timestamp,crypto_name,date,...,Unnamed: 0_x_close,open_x_high,open_x_low,open_x_close,high_x_low,high_x_close,low_x_close,Unnamed: 0_bin,open_bin,high_bin
0,-1.732027,-0.167146,-0.166443,-0.167259,-0.167158,-0.260759,-0.197777,-2.502156,-1.331232,-2.502156,...,0.266618,0.023601,0.023707,0.023694,0.023616,0.023602,0.023709,0,4,4
1,-1.73198,-0.181121,-0.180757,-0.180942,-0.181205,-0.260759,-0.217903,-2.502156,0.090683,-2.502156,...,0.287051,0.027395,0.02742,0.027457,0.02737,0.027406,0.027432,0,0,0
2,-1.731932,-0.167146,-0.166443,-0.167259,-0.167158,-0.260759,-0.198422,-2.502156,-1.331232,-2.502156,...,0.266603,0.023601,0.023707,0.023694,0.023616,0.023602,0.023709,0,4,4
3,-1.731885,-0.181098,-0.180738,-0.180996,-0.181255,-0.260759,-0.217965,-2.502156,0.090683,-2.502156,...,0.287108,0.027389,0.027425,0.02746,0.027375,0.027411,0.027446,0,0,0
4,-1.731837,-0.167146,-0.166443,-0.167259,-0.167158,-0.260759,-0.198559,-2.502156,-1.331232,-2.502156,...,0.266589,0.023601,0.023707,0.023694,0.023616,0.023602,0.023709,0,4,4
