# Column_Transformer

#### Import the required libraries

In [1]:
import pandas as pd

  from pandas.core import (


#### Load the dataset

In [2]:
# Read only the first six rows of the CSV behind the short link;
# a tiny sample keeps every displayed output small enough to read in full.
data = pd.read_csv(
    filepath_or_buffer="http://bit.ly/kaggletrain",
    nrows=6,
)


In [3]:
# The four columns that will be pushed through the column transformer.
columns = ['Embarked', 'Sex', 'Age', 'Fare']
# Select all rows, restricted to those columns.
features = data.loc[:, columns]

In [4]:
# Show the selected columns; note that the last row has no Age value.
features

Unnamed: 0,Embarked,Sex,Age,Fare
0,S,male,22.0,7.25
1,C,female,38.0,71.2833
2,S,female,26.0,7.925
3,S,female,35.0,53.1
4,S,male,35.0,8.05
5,Q,male,,8.4583


In [5]:
# 'SimpleImputer' is a class from scikit-learn's 'sklearn.impute' module.
# It provides simple strategies for imputing (i.e., filling in) missing values in a dataset.
# With its default settings it replaces each missing value with the mean of that column.

#### Import all the required sklearn libraries

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

In [7]:
# Unfitted preprocessing steps, both left at their default settings:
# `ohe` encodes categorical columns, `si` fills in missing values.
ohe, si = OneHotEncoder(), SimpleImputer()

In [8]:
# Build a single transformer that routes each column to its preprocessing step.
ct = make_column_transformer(
    (ohe, ['Embarked', 'Sex']),  # one-hot encode the two categorical columns
    (si, ['Age']),               # fill in missing Age values
    remainder='passthrough',     # keep every remaining column (here: Fare) unchanged
    # Always return a dense array: later cells wrap the result in pd.DataFrame,
    # so the output type must not depend on how sparse the encoded data is.
    sparse_threshold=0,
)

In [22]:
# Fit every step of the column transformer on `features` and transform it in one call.
transformed_col = ct.fit_transform(features)

In [23]:
# Inspect the result: one-hot columns first, then the imputed Age, then the passthrough Fare.
transformed_col

array([[ 0.    ,  0.    ,  1.    ,  0.    ,  1.    , 22.    ,  7.25  ],
       [ 1.    ,  0.    ,  0.    ,  1.    ,  0.    , 38.    , 71.2833],
       [ 0.    ,  0.    ,  1.    ,  1.    ,  0.    , 26.    ,  7.925 ],
       [ 0.    ,  0.    ,  1.    ,  1.    ,  0.    , 35.    , 53.1   ],
       [ 0.    ,  0.    ,  1.    ,  0.    ,  1.    , 35.    ,  8.05  ],
       [ 0.    ,  1.    ,  0.    ,  0.    ,  1.    , 31.2   ,  8.4583]])

That covers the column transformer itself. The remaining cells turn the transformed array back into a labelled DataFrame.

#### Get the feature names for one-hot encoded columns

In [38]:
# `categories_` holds one array of category labels per one-hot-encoded column,
# in the order the columns were passed to the encoder.
ohe_categories = ct.named_transformers_['onehotencoder'].categories_
# Flatten every per-column array into a single list of labels. Iterating over all
# arrays (rather than indexing [0] and [1]) keeps this correct if more columns are
# one-hot encoded later.
ohe_feature_names = [
    category
    for column_categories in ohe_categories
    for category in column_categories
]

In [39]:
# Check the flattened category labels.
ohe_feature_names

['C', 'Q', 'S', 'female', 'male']

In [40]:
# Column labels must follow the transformer's output order:
# one-hot columns, then the imputed Age, then the passthrough Fare.
column_names = [*ohe_feature_names, 'Age', 'Fare']

In [41]:
# Confirm the full list of labels for the transformed array.
column_names

['C', 'Q', 'S', 'female', 'male', 'Age', 'Fare']

In [42]:
# Label the transformed array. Reusing the source rows' index means the later
# column-wise concat with `data` lines up row-for-row even when that index is
# not the default 0..n-1 range (e.g. after filtering or sampling).
transformed_data = pd.DataFrame(
    transformed_col,
    columns=column_names,
    index=features.index,
)

In [43]:
# Preview the labelled frame; the one-hot columns are still floats at this point.
transformed_data

Unnamed: 0,C,Q,S,female,male,Age,Fare
0,0.0,0.0,1.0,0.0,1.0,22.0,7.25
1,1.0,0.0,0.0,1.0,0.0,38.0,71.2833
2,0.0,0.0,1.0,1.0,0.0,26.0,7.925
3,0.0,0.0,1.0,1.0,0.0,35.0,53.1
4,0.0,0.0,1.0,0.0,1.0,35.0,8.05
5,0.0,1.0,0.0,0.0,1.0,31.2,8.4583


In [44]:
# The one-hot indicator columns only ever hold 0.0 or 1.0, so store them as integers.
indicator_cols = ['female', 'male', 'C', 'Q', 'S']
transformed_data[indicator_cols] = transformed_data[indicator_cols].astype(int)

In [45]:
# Everything from the original frame except the columns that were transformed.
untransformed = data.drop(columns=columns)
# Put the untouched columns and the transformed ones side by side (rows are matched on index).
final_data = pd.concat([untransformed, transformed_data], axis="columns")

In [46]:
# Display the combined frame: original untouched columns followed by the transformed ones.
final_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,SibSp,Parch,Ticket,Cabin,C,Q,S,female,male,Age,Fare
0,1,0,3,"Braund, Mr. Owen Harris",1,0,A/5 21171,,0,0,1,0,1,22.0,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,C85,1,0,0,1,0,38.0,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,,0,0,1,1,0,26.0,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,C123,0,0,1,1,0,35.0,53.1
4,5,0,3,"Allen, Mr. William Henry",0,0,373450,,0,0,1,0,1,35.0,8.05
5,6,0,3,"Moran, Mr. James",0,0,330877,,0,1,0,0,1,31.2,8.4583


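The transformed columns (Embarked, Sex, Age and Fare) are now in numeric form. Columns such as Name, Ticket and Cabin are still text, so the next step is to drop the irrelevant columns and finish the remaining pre-processing before feeding this data to a machine-learning model.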