In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import os
from typing import Tuple
from typing_extensions import Annotated 

from mlProject import logger

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [2]:
# Load the training CSV from the data-ingestion artifacts directory.
df = pd.read_csv(filepath_or_buffer="../artifacts/data_ingestion/train.csv")

In [3]:
# Show up to 120 rows/columns and render floats with two decimals.
display_options = {
    "display.max_columns": 120,
    "display.max_rows": 120,
    'display.float_format': '{:.2f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [4]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends five ratio features to a DataFrame.

    Each new column is the quotient of two existing columns. A small constant
    is added to every denominator so that a zero denominator does not cause a
    division by zero.
    """

    # Added to every denominator to avoid dividing by exactly zero.
    _EPSILON = 0.0000001

    def fit(self, X, y=None):
        """Do nothing; this transformer learns no state from the data.

        Args:
            X: Ignored.
            y: Ignored.
        Returns:
            FeatureEngineering: The transformer itself.
        """
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with the engineered ratio columns appended.

        The input frame is left untouched, so calling this repeatedly on the
        same frame always starts from the same columns.

        Args:
            X (pd.DataFrame): The data to transform. Must contain the columns
                ``atomicweight_Average``, ``ionenergy_Average``,
                ``density_Total``, ``allelectrons_Total``,
                ``el_neg_chi_Average``, ``R_vdw_element_Average`` and
                ``allelectrons_Average``.
        Returns:
            pd.DataFrame: A new frame holding the original columns followed by
            the five derived columns.
        Raises:
            Exception: Any error raised while building the features (for
                example a ``KeyError`` for a missing column) is logged and
                re-raised unchanged.
        """
        try:
            eps = self._EPSILON

            # DataFrame.assign builds a new frame, so the caller's X is not
            # mutated. Every feature reads only the original input columns.
            return X.assign(
                # Atomic weight and ionization energy ratio
                atomicweight_ionenergy_Ratio=X["atomicweight_Average"] / (X["ionenergy_Average"] + eps),
                # Total density relative to the total number of electrons
                normalized_density=X["density_Total"] / (X["allelectrons_Total"] + eps),
                # Electronegativity and Van der Waals radius ratio
                el_neg_chi_R_vdw_Ratio=X["el_neg_chi_Average"] / (X["R_vdw_element_Average"] + eps),
                # Average number of electrons per unit of average atomic weight
                electrons_per_atomicweight=X["allelectrons_Average"] / (X["atomicweight_Average"] + eps),
                # Total number of electrons per unit of average atomic weight
                specific_electron_count=X["allelectrons_Total"] / (X["atomicweight_Average"] + eps),
            )

        except Exception as e:
            logger.error(f"Feature engineering failed with the following error: {e}")
            raise

In [5]:
class FixOutliers(BaseEstimator, TransformerMixin):
    """Drop rows whose values exceed an IQR-based upper fence.

    For each configured column, in order, the upper fence is computed as
    ``Q3 + iqr_multiplier * (Q3 - Q1)`` on the rows that are still present, and
    rows above that fence are removed. Only the upper fence is applied;
    unusually low values are kept. The fences are recomputed from whatever
    frame is passed to ``transform``; nothing is learned in ``fit``.

    Because rows are removed, the returned frame can be shorter than the input
    and keeps the surviving rows' original index labels. Any target vector
    held by the caller is not filtered by this class.

    Args:
        outlier_columns: Columns to filter on. ``None`` selects
            ``DEFAULT_OUTLIER_COLUMNS``.
        iqr_multiplier: Number of interquartile ranges above the third
            quartile at which the fence is placed.
    """

    DEFAULT_OUTLIER_COLUMNS = (
        "allelectrons_Total", "atomicweight_ionenergy_Ratio", "normalized_density",
        "el_neg_chi_R_vdw_Ratio", "electrons_per_atomicweight", "specific_electron_count",
    )

    def __init__(self, outlier_columns=None, iqr_multiplier=3):
        # Stored unmodified so BaseEstimator.get_params / set_params work.
        self.outlier_columns = outlier_columns
        self.iqr_multiplier = iqr_multiplier

    def fit(self, X, y=None):
        """Do nothing; the fences are derived inside ``transform``.

        Args:
            X: Ignored.
            y: Ignored.
        Returns:
            FixOutliers: The transformer itself.
        """
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transform the data by removing rows above the upper fence.

        Args:
            X (pd.DataFrame): The data to transform. Must contain every
                configured outlier column.
        Returns:
            pd.DataFrame: The rows of ``X`` that are at or below the upper
            fence of every configured column. Rows holding NaN in one of those
            columns are dropped too, because the comparison is False for NaN.
        Raises:
            Exception: Any error raised while filtering (for example a
                ``KeyError`` for a missing column) is logged and re-raised
                unchanged.
        """
        try:
            columns = (
                self.DEFAULT_OUTLIER_COLUMNS
                if self.outlier_columns is None
                else self.outlier_columns
            )

            for column in columns:
                # Quartiles are taken on the rows that survived the previous
                # columns' filters, so the column order matters.
                first_quartile = X[column].quantile(0.25)
                third_quartile = X[column].quantile(0.75)
                upper_fence = third_quartile + (third_quartile - first_quartile) * self.iqr_multiplier

                X = X[X[column] <= upper_fence]

            return X

        except Exception as e:
            logger.error(f"Fixing outliers failed with the following error: {e}")
            raise

In [6]:
class DataPreprocessing(BaseEstimator, TransformerMixin):
    """Drop the ``id`` column and min-max scale the numeric columns.

    The scaler is learned in ``fit`` and reused by ``transform``, so data
    transformed after fitting is scaled with the ranges of the fitting data
    rather than with its own ranges.

    Attributes set by ``fit``:
        num_cols_: Names of the numeric columns that are scaled.
        scaler_: The fitted ``MinMaxScaler``.
    """

    @staticmethod
    def _numeric_features(X: pd.DataFrame) -> pd.DataFrame:
        """Return the numeric columns of ``X`` with the ``id`` column removed."""
        return X.drop(columns=["id"]).select_dtypes(include=np.number)

    def fit(self, X, y=None):
        """Learn the per-column minimum and maximum used for scaling.

        Args:
            X (pd.DataFrame): The data to learn the ranges from. Must contain
                an ``id`` column.
            y: Ignored.
        Returns:
            DataPreprocessing: The fitted transformer.
        Raises:
            Exception: Any error raised while fitting is logged and re-raised
                unchanged.
        """
        try:
            numeric = self._numeric_features(X)
            self.num_cols_ = numeric.columns.to_list()
            self.scaler_ = MinMaxScaler().fit(numeric)
            return self

        except Exception as e:
            logger.error(f"Data preprocessing failed with the following error: {e}")
            raise

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transform the data by applying data preprocessing.

        Args:
            X (pd.DataFrame): The data to transform. Must contain an ``id``
                column and, once fitted, every column seen during ``fit``.
        Returns:
            pd.DataFrame: The scaled numeric columns. The frame is rebuilt
            from the scaled matrix, so it carries a fresh 0..n-1 index rather
            than the index of ``X``.
        Raises:
            Exception: Any error raised while transforming is logged and
                re-raised unchanged.
        """
        try:
            scaler = getattr(self, "scaler_", None)

            if scaler is None:
                # Backward compatibility: when fit() was never called, scale
                # with the ranges of the frame passed in, without storing state.
                numeric = self._numeric_features(X)
                num_cols = numeric.columns.to_list()
                scaler = MinMaxScaler().fit(numeric)
            else:
                num_cols = self.num_cols_
                numeric = X.drop(columns=["id"])[num_cols]

            logger.info(f"Scaling numeric columns: {num_cols}")
            matrix = scaler.transform(numeric)

            return pd.DataFrame(matrix, columns=num_cols)

        except Exception as e:
            logger.error(f"Data preprocessing failed with the following error: {e}")
            raise

In [7]:
# Read the training CSV from the data-ingestion artifacts directory into `df`.
df = pd.read_csv(filepath_or_buffer="../artifacts/data_ingestion/train.csv")

In [8]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the loaded frame.
df.describe()

Unnamed: 0,id,allelectrons_Total,density_Total,allelectrons_Average,val_e_Average,atomicweight_Average,ionenergy_Average,el_neg_chi_Average,R_vdw_element_Average,R_cov_element_Average,zaratio_Average,density_Average,Hardness
count,10407.0,10407.0,10407.0,10407.0,10407.0,10407.0,10407.0,10407.0,10407.0,10407.0,10407.0,10407.0,10407.0
mean,5203.0,128.05,14.49,17.03,4.55,37.51,10.94,2.61,1.73,0.94,0.49,2.13,4.65
std,3004.39,224.12,15.97,10.47,0.69,26.01,1.41,0.33,0.19,0.18,0.06,1.94,1.68
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,2601.5,68.0,7.56,10.0,4.0,20.3,10.59,2.53,1.67,0.86,0.48,0.81,3.0
50%,5203.0,100.0,10.65,12.6,4.71,26.2,11.2,2.71,1.73,0.92,0.49,1.35,5.5
75%,7804.5,131.0,16.68,22.0,4.8,48.72,11.67,2.81,1.8,0.98,0.5,2.74,6.0
max,10406.0,15300.0,643.09,67.0,6.0,167.4,15.25,3.44,2.25,1.62,0.83,10.97,10.0


In [9]:
# NOTE(review): `fe` is not referenced by any later cell visible here; the pipeline
# cell constructs its own FeatureEngineering instance.
fe = FeatureEngineering()

In [11]:
# Separate the `Hardness` target from the remaining columns.
y = df["Hardness"]
X = df.drop(columns="Hardness")

In [19]:
# Preview the feature frame. The previous expression referenced `X_train`,
# which no cell defines, so the cell failed with a NameError.
X.head()

In [13]:
from sklearn.pipeline import Pipeline

In [20]:
# Chain the three custom transformers; each step receives the previous step's output.
pipe = Pipeline(
    steps=[
        ("Feature Engineering", FeatureEngineering()),
        ("Fix Outliers", FixOutliers()),
        ("Data Preprocessing", DataPreprocessing()),
    ]
)

In [21]:
# Display the assembled pipeline object.
pipe

In [22]:
# Display the raw training frame; the rendering is truncated to the first and last rows.
df

Unnamed: 0,id,allelectrons_Total,density_Total,allelectrons_Average,val_e_Average,atomicweight_Average,ionenergy_Average,el_neg_chi_Average,R_vdw_element_Average,R_cov_element_Average,zaratio_Average,density_Average,Hardness
0,0,100.00,0.84,10.00,4.80,20.61,11.09,2.77,1.73,0.86,0.50,0.91,6.00
1,1,100.00,7.56,10.00,4.80,20.30,12.04,2.75,1.63,0.91,0.49,0.72,6.50
2,2,76.00,8.89,15.60,5.60,33.74,12.09,2.83,1.79,0.86,0.48,1.51,2.50
3,3,100.00,8.80,10.00,4.80,20.21,10.95,2.65,1.63,0.94,0.49,0.79,6.00
4,4,116.00,9.58,11.60,4.80,24.99,11.82,2.77,1.68,0.90,0.49,1.86,6.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10402,10402,128.00,7.56,12.00,4.00,26.39,11.33,2.64,1.63,0.89,0.50,1.80,4.00
10403,10403,30.00,1.74,10.00,5.33,20.77,14.16,3.09,1.56,0.87,0.48,0.81,5.00
10404,10404,196.00,30.92,24.50,5.50,53.49,10.07,2.29,1.54,1.12,0.47,2.12,1.80
10405,10405,38.00,1.55,12.67,4.67,26.62,11.29,2.74,1.76,0.98,0.49,0.78,6.00


In [23]:
# Run all three pipeline steps on the features. The result is only displayed, not stored.
# NOTE(review): the displayed result has fewer rows than `df` (FixOutliers filters rows),
# while `y` is passed through unchanged — confirm how the target is meant to be re-aligned.
pipe.fit_transform(X, y)

['allelectrons_Total', 'density_Total', 'allelectrons_Average', 'val_e_Average', 'atomicweight_Average', 'ionenergy_Average', 'el_neg_chi_Average', 'R_vdw_element_Average', 'R_cov_element_Average', 'zaratio_Average', 'density_Average', 'atomicweight_ionenergy_Ratio', 'normalized_density', 'el_neg_chi_R_vdw_Ratio', 'electrons_per_atomicweight', 'specific_electron_count']


Unnamed: 0,allelectrons_Total,density_Total,allelectrons_Average,val_e_Average,atomicweight_Average,ionenergy_Average,el_neg_chi_Average,R_vdw_element_Average,R_cov_element_Average,zaratio_Average,density_Average,atomicweight_ionenergy_Ratio,normalized_density,el_neg_chi_R_vdw_Ratio,electrons_per_atomicweight,specific_electron_count
0,0.32,0.01,0.20,0.80,0.16,0.73,0.80,0.77,0.55,0.60,0.08,0.15,0.02,0.70,0.80,0.36
1,0.32,0.08,0.20,0.80,0.16,0.79,0.80,0.72,0.59,0.60,0.07,0.14,0.19,0.74,0.81,0.37
2,0.24,0.09,0.31,0.93,0.27,0.79,0.82,0.79,0.56,0.58,0.14,0.22,0.29,0.69,0.76,0.17
3,0.32,0.09,0.20,0.80,0.16,0.72,0.77,0.72,0.60,0.59,0.07,0.15,0.22,0.71,0.81,0.37
4,0.37,0.10,0.23,0.80,0.20,0.78,0.80,0.75,0.58,0.60,0.17,0.17,0.21,0.72,0.76,0.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9526,0.41,0.08,0.24,0.67,0.21,0.74,0.77,0.72,0.58,0.60,0.16,0.19,0.15,0.71,0.75,0.36
9527,0.10,0.02,0.20,0.89,0.16,0.93,0.90,0.69,0.56,0.58,0.07,0.12,0.15,0.87,0.79,0.11
9528,0.62,0.31,0.48,0.92,0.42,0.66,0.67,0.69,0.72,0.57,0.19,0.43,0.40,0.65,0.75,0.27
9529,0.12,0.02,0.25,0.78,0.21,0.74,0.80,0.78,0.63,0.59,0.07,0.19,0.10,0.68,0.78,0.11
