In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## STEP 1 : Data Collection

In [2]:
# Relative path of the iris CSV loaded in the next cell.
filepath_iris = "PART1_EDA_IRIS.csv"

In [3]:
# First row supplies the column names; first column is used as the row index.
iris = pd.read_csv(filepath_iris, index_col=0, header=0)

In [4]:
iris.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
iris.shape

(147, 5)

## STEP 2 : Importing packages

In [6]:
from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler

# from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline                
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

## STEP 3 : Pipeline

In [7]:
# Work on a deep copy so the loaded `iris` frame stays untouched.
df = iris.copy(deep=True)

In [8]:
def num_cat_col_list(DF):
    """Split the columns of ``DF`` into numeric and string-typed name lists.

    Each column is tested first with ``is_numeric_dtype`` and, only if that
    fails, with ``is_string_dtype``. Columns matching neither check (for
    example datetimes) appear in neither list.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple of (list, list)
        ``(numeric_column_names, string_column_names)``, each in the
        frame's column order.
    """
    from pandas.api.types import is_numeric_dtype, is_string_dtype

    numeric_cols, string_cols = [], []

    for name in DF:
        values = DF[name]

        # Numeric takes precedence; the string check is the fallback.
        if is_numeric_dtype(values):
            bucket = numeric_cols
        elif is_string_dtype(values):
            bucket = string_cols
        else:
            continue

        bucket.append(name)

    return (numeric_cols, string_cols)

In [9]:
num_list, cat_list = num_cat_col_list(df)

# Report both lists, separated by one blank line.
print("Numerical column : ", num_list, end="\n\n")
print("Categorical column : ", cat_list)

Numerical column :  ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

Categorical column :  ['Species']


In [10]:
# num_list

num_feat = num_list.copy()

cat_feat = cat_list.copy()

In [11]:
def target_encode(ytrain, ytest, mapping=None):
    """Encode class labels of the train and test targets as integer codes.

    The inputs are never modified: new Series are returned. (The previous
    implementation used ``replace(..., inplace=True)``, which rewrote the
    caller's Series and relied on version-dependent downcasting in
    ``Series.replace`` to end up with an integer dtype.)

    Parameters
    ----------
    ytrain, ytest : pandas.Series
        Target labels of the train and test splits.
    mapping : dict, optional
        Label -> code lookup. Defaults to
        ``{'setosa': 0, 'versicolor': 1, 'virginica': 2}``.

    Returns
    -------
    tuple of pandas.Series
        ``(ytrain_encoded, ytest_encoded)``, keeping each input's index and
        name. Values that are not keys of ``mapping`` pass through unchanged,
        so already-encoded input is returned as-is.
    """
    if mapping is None:
        mapping = {'setosa': 0, 'versicolor': 1, 'virginica': 2}

    def _encode(labels):
        # Element-wise lookup with pass-through for unknown values; this
        # mirrors what ``replace`` did for labels outside the mapping.
        return labels.map(lambda label: mapping.get(label, label))

    return _encode(ytrain), _encode(ytest)

In [12]:
def pipeline_trans(DF, num_feat, cat_feat=None, target='Species',
                   test_size=0.2, random_state=100):
    """Split ``DF`` into train/test sets, impute numeric features, encode the target.

    Steps, in order:
      1. separate the ``target`` column from the remaining columns;
      2. stratified train/test split;
      3. encode the target labels with ``target_encode``;
      4. median-impute the ``num_feat`` columns (fitted on the train split
         only, then applied to the test split);
      5. rebuild train and test DataFrames with a fresh 0..n-1 index and the
         encoded target appended as the last column.

    Parameters
    ----------
    DF : pandas.DataFrame
        Input data containing the feature columns and the ``target`` column.
    num_feat : list
        Names of the numeric columns handed to the imputer. Only these
        columns are passed to the ColumnTransformer.
    cat_feat : list, optional
        Accepted for call compatibility; not used by this function.
    target : str, default 'Species'
        Name of the label column.
    test_size : float, default 0.2
        Fraction of rows placed in the test split.
    random_state : int, default 100
        Seed for the train/test split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train_trans, df_test_trans)``.
    """
    # Separating independent and dependent variables
    X = DF.drop(labels=[target], axis=1)
    y = DF[target]

    # Train-test split, stratified on the target so class proportions match
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=test_size, stratify=y)

    # Target encoding
    y_train, y_test = target_encode(ytrain=y_train, ytest=y_test)

    # Pipeline: median imputation of the numeric features
    num_pipeline = Pipeline([('Imputer', SimpleImputer(strategy='median'))])
    pipeline = ColumnTransformer([("Num pipeline", num_pipeline, num_feat)])

    # Fit on the train split only; reuse the fitted medians for the test split
    X_train_tr = pipeline.fit_transform(X_train)
    X_test_tr = pipeline.transform(X_test)

    # Output columns after transformation. Names come back as
    # "<transformer>__<column>"; split once so that only the transformer
    # prefix is removed and a column that itself contains "__" is kept whole.
    out_cols = [col.split("__", 1)[-1] for col in pipeline.get_feature_names_out()]

    # Conversion to train and test DataFrames
    X_train_tr_df = pd.DataFrame(X_train_tr, columns=out_cols)
    X_test_tr_df = pd.DataFrame(X_test_tr, columns=out_cols)

    # The transformed frames have a fresh RangeIndex, so the labels need the
    # same index for the column-wise concat to align row by row.
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    df_train_trans = pd.concat([X_train_tr_df, y_train], axis=1)
    df_test_trans = pd.concat([X_test_tr_df, y_test], axis=1)

    return df_train_trans, df_test_trans

In [13]:
# Build the transformed train / test frames from the working copy.
df_train_tr, df_test_tr = pipeline_trans(df, num_feat)

In [14]:
df_train_tr.head(3)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,6.4,3.2,5.3,2.3,2
1,4.9,2.4,3.3,1.0,1
2,5.9,3.0,5.1,1.8,2


In [15]:
df_test_tr.head(3)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.4,3.0,4.5,1.5,1
1,5.6,3.0,4.1,1.3,1
2,6.3,2.8,5.1,1.5,2


## STEP 4 : Export the train and test DataFrames as .csv files

In [16]:
# Export from copies so the transformed frames above stay as displayed.
D_train, D_test = df_train_tr.copy(), df_test_tr.copy()

In [17]:
# The row index is written too (to_csv default), as the first, unnamed column.
D_train.to_csv(path_or_buf=r"S1_part2_pipeline_Train_DS.csv")
D_test.to_csv(path_or_buf=r"S1_part2_pipeline_Test_DS.csv")