In [1]:
import numpy as np
import pandas as pd


In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [3]:
# Load the toy COVID dataset from the notebook's working directory.
df = pd.read_csv('covid_toy.csv')

In [4]:
# Frequency of each city; used to see which categories the encoder must handle.
df['city'].value_counts()

city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

In [5]:
# Missing values per column (tells us which columns need imputation).
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [6]:
# Column dtypes and non-null counts: separates numeric from categorical columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; `has_covid` is the target.
# random_state pins the shuffle so every fresh run produces the same split
# (without it, each "Restart & Run All" yields different rows and outputs).
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Per-column preprocessing. Columns that are not listed in any step are
# kept as-is (remainder='passthrough') and appended after the step outputs.
transformer = ColumnTransformer(
    transformers=[
        # Fill missing fever readings using SimpleImputer's default strategy.
        ('tnf1', SimpleImputer(), ['fever']),
        # cough is ordinal: categories are given in order, so Mild -> 0, Strong -> 1.
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # gender and city are nominal: one-hot encode as a dense array and drop
        # the first level of each feature.
        ('tnf3', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city']),
    ],
    remainder='passthrough',
)

In [9]:
# Learn the imputation value and category mappings from the training split
# only, then apply them to it.
X_train_transform = transformer.fit_transform(X_train)

In [10]:
# Apply the already-fitted transformer to the test split (transform only,
# no refitting on test data).
X_test_transform = transformer.transform(X_test)

In [11]:
# Pull the fitted one-hot encoder back out of the ColumnTransformer by its
# step name so we can ask it for the names of the columns it produced.
ohe = transformer.named_transformers_['tnf3']
ohe_cols = ohe.get_feature_names_out(input_features=['gender', 'city'])

In [12]:
# Inspect the generated one-hot column names.
ohe_cols

array(['gender_Male', 'city_Delhi', 'city_Kolkata', 'city_Mumbai'],
      dtype=object)

In [13]:
# Column names for the transformed array, in the order ColumnTransformer
# emits them: outputs of the listed steps first (fever, cough, one-hot
# columns), then the passthrough remainder columns.
# Parentheses are used for the multi-line expression: a backslash
# continuation cannot be followed by a comment line (that is a SyntaxError).
new_columns = (
    ['fever', 'cough']
    + list(ohe_cols)
    + [col for col in X_train.columns if col not in ['fever', 'cough', 'gender', 'city']]
)

In [14]:
# Wrap both transformed arrays in DataFrames so the columns carry readable names.
# NOTE: the frames get a fresh positional index (0..n-1), so they no longer
# share index labels with y_train / y_test; pair them by position, not by label.
X_train_df, X_test_df = (
    pd.DataFrame(transformed, columns=new_columns)
    for transformed in (X_train_transform, X_test_transform)
)

In [15]:
# Preview the first transformed training rows with their column names.
X_train_df.head()

Unnamed: 0,fever,cough,gender_Male,city_Delhi,city_Kolkata,city_Mumbai,age
0,100.0,0.0,1.0,1.0,0.0,0.0,27.0
1,103.0,0.0,1.0,0.0,1.0,0.0,60.0
2,104.0,0.0,1.0,0.0,1.0,0.0,16.0
3,102.0,1.0,0.0,0.0,0.0,0.0,82.0
4,99.0,0.0,0.0,0.0,0.0,0.0,22.0
