In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Load the toy COVID dataset and preview its first seven rows.
data = pd.read_csv(filepath_or_buffer='datasets/covidtoy.csv')
data.head(n=7)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
5,84,Female,,Mild,Bangalore,Yes
6,14,Male,101.0,Strong,Bangalore,No


In [3]:
# Category frequencies for the two categorical features; the separator
# reproduces the single blank line between the two printed tables.
print(data['cough'].value_counts(), data['city'].value_counts(), sep='\n\n')

Mild      62
Strong    38
Name: cough, dtype: int64

Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: city, dtype: int64


In [4]:
# Count missing values per column (`isna` is the canonical name of `isnull`).
data.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [5]:
# Preprocess the toy COVID frame in one ColumnTransformer, selecting columns by name.
# Output column order: cough, fever, one-hot(gender, city), then the untouched
# remainder columns (age, has_covid).
ct = ColumnTransformer(
    transformers=[
        # Explicit category order: 'Mild' -> 0.0, 'Strong' -> 1.0.
        ('c1', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # Fill the missing fever readings with the column mean.
        ('c2', SimpleImputer(strategy='mean'), ['fever']),
        # One indicator column per gender / city value; no category is dropped.
        ('c3', OneHotEncoder(drop=None), ['gender', 'city']),
    ],
    remainder='passthrough',
    # Always return a dense array. This replaces OneHotEncoder(sparse=False):
    # the `sparse` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
    # whereas `sparse_threshold=0` works on both old and new releases.
    sparse_threshold=0,
)
# Show the first transformed row.
ct.fit_transform(data)[0]

array([0.0, 103.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 60, 'No'], dtype=object)

In [6]:
# Same preprocessing as `ct`, but columns are addressed by integer position
# instead of by name (0 age, 1 gender, 2 fever, 3 cough, 4 city, 5 has_covid).
ct_2 = ColumnTransformer(
    [
        ('c1', OrdinalEncoder(categories=[['Mild', 'Strong']]), [3]),  # cough
        ('c2', SimpleImputer(strategy='mean'), [2]),                   # fever
        ('c3', OneHotEncoder(drop=None), [1, 4]),                      # gender, city
    ],
    remainder='passthrough',
)
# Show the first transformed row.
ct_2.fit_transform(data)[0]

array([0.0, 103.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 60, 'No'], dtype=object)

In [7]:
# First raw row, for side-by-side comparison with the transformed rows above.
data.iloc[:1]

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No


# Second dataset 

In [8]:
# Load the data-science job dataset used for the two-stage pipeline below.
data_second = pd.read_csv(filepath_or_buffer='datasets/datascience_jobdata.csv')

In [9]:
# Discard the columns that are not used in this exercise.
# Re-assign instead of mutating in place, and tolerate columns that are already
# gone, so that re-running this cell does not raise KeyError.
# NOTE: errors='ignore' also means a column missing from the CSV is not flagged.
data_second = data_second.drop(
    columns=['enrollee_id', 'city', 'city_development_index', 'company_size', 'company_type'],
    errors='ignore',
)

In [10]:
# Count missing values per remaining column to decide which ones need imputing.
data_second.isna().sum()

gender                 4508
relevent_experience       0
enrolled_university     386
education_level         460
major_discipline       2813
experience               65
training_hours          766
target                    0
dtype: int64

In [38]:
# Feature matrix: every column except the `target` label.
X = data_second.drop(columns=['target'])
X.iloc[:1]

Unnamed: 0,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,training_hours
0,Male,Has relevent experience,no_enrollment,Graduate,STEM,20.0,36.0


In [60]:
# Stage 1: impute missing values only (no encoding yet).
# The transformed columns come out in this order, which the next stage relies on:
#   0 gender, 1 enrolled_university, 2 education_level, 3 major_discipline   (ctb1)
#   4 experience, 5 training_hours                                           (ctb2)
#   6 relevent_experience                                                    (remainder)
ct_b = ColumnTransformer(
    [
        # Categorical gaps -> most frequent value of each column.
        ('ctb1', SimpleImputer(strategy='most_frequent'),
         ['gender', 'enrolled_university', 'education_level', 'major_discipline']),
        # Numeric gaps -> column mean.
        ('ctb2', SimpleImputer(strategy='mean'), ['experience', 'training_hours']),
    ],
    remainder='passthrough',
)
# To inspect the first imputed row on its own: ct_b.fit_transform(X)[0]

Watch out: selecting columns by name in the second transformer raises an error!
The important point is that a ColumnTransformer returns a NumPy ndarray, not a DataFrame. Because we will chain two ColumnTransformers inside a pipeline later, the first transformer's output becomes the second's input. If the second transformer selects columns by name, we get the error below. <br> ValueError: Specifying the columns using strings is only supported for pandas DataFrames.
<br> I made this same mistake and learned from it :D When I first wrote ct_c I passed the column names; later I switched to integer indices that give each column's position in the first transformer's output. Yes, this looks tedious...

In [46]:
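# Kept for reference only: first attempt at ct_c, selecting columns by name.
# It fails as the second pipeline step because that step receives an ndarray,
# and string column selection is only supported for pandas DataFrames.
# ct_c = ColumnTransformer(transformers=[
#     ('ctc1',OneHotEncoder(handle_unknown='ignore',sparse=False),['gender','relevent_experience','enrolled_university', 'education_level',
#                                                    'major_discipline'])
# ],remainder='passthrough')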


In [61]:
# Stage 2: one-hot encode the categorical columns of ct_b's output.
# The input here is an ndarray, so columns must be selected by position:
#   0 gender, 1 enrolled_university, 2 education_level, 3 major_discipline,
#   6 relevent_experience (ct_b's passthrough remainder).
# Positions 4 and 5 (experience, training_hours) are numeric and pass through.
ct_c = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': categories unseen during fit encode as all zeros.
        ('ctc1', OneHotEncoder(handle_unknown='ignore'), [0, 1, 2, 3, 6]),
    ],
    remainder='passthrough',
    # Always return a dense array. This replaces OneHotEncoder(sparse=False):
    # the `sparse` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
    # whereas `sparse_threshold=0` works on both old and new releases.
    sparse_threshold=0,
)

In [62]:
# Chain the two stages: imputation (ct_b) feeds its output into encoding (ct_c).
pipe = Pipeline([('p1', ct_b), ('p2', ct_c)])

In [63]:
# Fit both stages on X, then show the first fully transformed row.
X_encoded = pipe.fit_transform(X)
X_encoded[0]

array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 20.0, 36.0], dtype=object)