# Feature Engineering:

## Imports:

In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [None]:
# Both raw splits are read the same way: the 'id' column becomes the index.
read_opts = {'index_col': 'id'}
trainDf = pd.read_csv('../RawData/train.csv', **read_opts)
testDf = pd.read_csv('../RawData/test.csv', **read_opts)

In [None]:
# Render the training frame first, then the test frame.
for frame in (trainDf, testDf):
    display(frame)

In [None]:
# Separate the label column ('class') from the feature columns.
targets = trainDf['class']
trainDf = trainDf.drop(columns='class')

# Show the labels, then the remaining features.
for obj in (targets, trainDf):
    display(obj)

## Binarize targets:

In [None]:
# Encode the label as 1 where the class is 'p' and 0 for everything else.
# The comparison is vectorized instead of calling a Python lambda per row;
# missing labels compare unequal to 'p' and therefore still map to 0.
targets = (targets == 'p').astype('int64')
display(targets)

In [None]:
# Give the binarized label a name that states what the 1s mean.
targets = targets.rename('class_is_p')
display(targets)

## Check for any Nulls in targets:

In [None]:
# True if at least one label is missing.
targets_have_nulls = targets.isna().any()
display(targets_have_nulls)

## Check for nulls in training data:

In [None]:
# Per-column flag: True where the training column contains any missing value.
train_null_flags = trainDf.isna().any()
display(train_null_flags)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
trainDf.info()

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
testDf.info()

In [None]:
class FeatureEngineerPipeline:
    """Impute, scale and encode a mixed numeric/categorical frame, with optional PCA.

    Numeric columns go through ``imputer -> scaler`` and categorical columns
    through ``imputer -> encoder``; both branches are combined by a
    ``ColumnTransformer``. When a ``PCA`` object is supplied it is applied to
    the combined output.

    Parameters
    ----------
    numericImputer, numericScaler
        Steps of the numeric branch, applied in that order.
    catImputer, catEncoder
        Steps of the categorical branch, applied in that order.
    numericColsArr, catColsArr
        Column selectors handed to the ``ColumnTransformer`` for each branch.
    PCA
        Optional decomposition object exposing ``fit``, ``transform`` and
        ``fit_transform``. ``None`` (the default) disables the step.
    """

    # Columns whose values featureEngineer leaves untouched (numeric measurements).
    _NUMERIC_COLS = frozenset(('cap-diameter', 'stem-height', 'stem-width'))
    # Category values accepted as-is: single lowercase letters and the 'error'
    # placeholder itself. Missing values are handled separately.
    _GOOD_VALUES = tuple('abcdefghijklmnopqrstuvwxyz') + ('error',)

    def __init__(self, numericImputer, catImputer, numericScaler, catEncoder,
                 numericColsArr, catColsArr, PCA=None):
        self.PCA = PCA

        self.numericPipeline = Pipeline([
            ('imputer', numericImputer),
            ('scaler', numericScaler),
        ])
        self.catPipeline = Pipeline([
            ('imputer', catImputer),
            ('encoder', catEncoder),
        ])
        self.CT = ColumnTransformer([
            ('numericPipeline', self.numericPipeline, numericColsArr),
            ('catPipeline', self.catPipeline, catColsArr),
        ])

        # True once self.PCA has been fitted on column-transformed data.
        self.PCAfitted = False

    def fit(self, X):
        """Fit the column transformer (and the PCA, if any) on ``X``.

        The PCA is fitted here, on the training data, so that a later
        ``transform`` call on other data never has to fit it.

        Returns
        -------
        self
        """
        if self.PCA is None:
            self.CT.fit(X)
            self.PCAfitted = False
        else:
            self.PCA.fit(self.CT.fit_transform(X))
            self.PCAfitted = True
        return self

    def transform(self, X):
        """Apply the fitted column transformer, then the PCA if one is configured."""
        X_new = self.CT.transform(X)
        if self.PCA is None:
            return X_new
        if not self.PCAfitted:
            # Fallback kept for callers that fitted self.CT directly instead
            # of going through fit()/fit_transform(): fit the PCA on first use.
            self.PCAfitted = True
            return self.PCA.fit_transform(X_new)
        return self.PCA.transform(X_new)

    def fit_transform(self, X):
        """Fit on ``X`` and return the transformed ``X``.

        With a PCA configured, the PCA-reduced data is returned and the PCA is
        marked as fitted, so a following ``transform`` reuses it instead of
        refitting.
        """
        X_new = self.CT.fit_transform(X)
        if self.PCA is None:
            return X_new
        X_reduced = self.PCA.fit_transform(X_new)
        self.PCAfitted = True
        return X_reduced

    def featureEngineer(self, df):
        """Return a cleaned copy of ``df`` with missing-value indicator columns.

        * In every column except the numeric ones listed in ``_NUMERIC_COLS``,
          any non-missing value that is not a single lowercase letter (or
          already ``'error'``) is replaced by ``'error'``. Missing values stay
          missing.
        * For every original column that contains a missing value, a
          ``"<col>_is_null"`` column holding 1 for missing and 0 otherwise is
          appended.

        The input frame is not modified.
        """
        new_df = df.copy()
        source_cols = list(new_df.columns)
        good_values = list(self._GOOD_VALUES)

        for col in source_cols:
            if col in self._NUMERIC_COLS:
                continue
            values = new_df[col]
            # isna() is used for the missing check because NaN never compares
            # equal to itself and so cannot be matched by value.
            keep = values.isna() | values.isin(good_values)
            new_df[col] = values.where(keep, 'error')

        for col in source_cols:
            missing = new_df[col].isna()
            if missing.any():
                new_df[f"{col}_is_null"] = missing.astype('int64')

        return new_df
            

In [None]:
# Column roles: the three numeric columns are named explicitly and every other
# column of the training frame is treated as categorical.
numericCols = ['cap-diameter', 'stem-height', 'stem-width']
catCols = [col for col in trainDf.columns if col not in numericCols]

# Numeric branch: iterative imputation seeded with the median, then robust scaling.
numericImputer = IterativeImputer(initial_strategy='median')
numericScaler = RobustScaler()

# Categorical branch: most-frequent imputation, then one-hot encoding.
catImputer = SimpleImputer(strategy='most_frequent')
catEncoder = OneHotEncoder(
    handle_unknown='infrequent_if_exist',
    min_frequency=1,
    drop='first',
)

# The PCA step is left disabled here; pass PCA=PCA(n_components=25) to enable it.
pipe = FeatureEngineerPipeline(
    numericImputer=numericImputer,
    catImputer=catImputer,
    numericScaler=numericScaler,
    catEncoder=catEncoder,
    numericColsArr=numericCols,
    catColsArr=catCols,
)

# NOTE(review): catCols is built before featureEngineer adds its "*_is_null"
# columns, so those indicators are not among the columns handed to the
# ColumnTransformer - confirm whether they are meant to reach the model.
trainDf_FE = pipe.featureEngineer(trainDf)
testDf_FE = pipe.featureEngineer(testDf)

# Fit on the training split only, then reuse the fitted steps for the test split.
X_train = pipe.fit_transform(trainDf_FE)
X_test = pipe.transform(testDf_FE)

In [None]:

def _as_dense_float32(matrix):
    """Return `matrix` as a dense float32 ndarray, whether it is sparse or dense."""
    # Sparse matrices expose toarray(); a dense result (for example when the
    # PCA step is enabled) does not and is converted as-is.
    dense = matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)
    return dense.astype(np.float32)


X_train = _as_dense_float32(X_train)
X_test = _as_dense_float32(X_test)

# Persist the feature matrices for later use.
np.save('X_train.npy', X_train)
np.save('X_test.npy', X_test)


In [None]:
# Store the binarized labels as float32; np.save appends the ".npy" extension
# because the given name does not already end with it.
y_train = targets.to_numpy(dtype=np.float32)
np.save('y_train', y_train)