In [1]:
# %pip install numpy pandas scikit-learn

In [2]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Load the toy COVID dataset from a CSV file located relative to the working directory.
df = pd.read_csv("./covid_toy.csv")

In [4]:
# Number of (rows, columns) in the loaded dataset.
df.shape

(100, 6)

In [5]:
# Peek at five randomly chosen rows; no seed is passed, so the rows differ on each run.
df.sample(5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
53,83,Male,98.0,Mild,Delhi,Yes
4,65,Female,101.0,Mild,Mumbai,No
49,44,Male,104.0,Mild,Mumbai,No
35,82,Female,102.0,Strong,Bangalore,No
68,54,Female,104.0,Strong,Kolkata,No


In [6]:
# Column dtypes and non-null counts; used here to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,fever
count,100.0,90.0
mean,44.22,100.844444
std,24.878931,2.054926
min,5.0,98.0
25%,20.0,99.0
50%,45.0,101.0
75%,66.5,102.75
max,84.0,104.0


In [8]:
# Frequency of each distinct value in the `city` column.
df["city"].value_counts()

city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

In [9]:
# Frequency of each distinct value in the `cough` column.
df["cough"].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [10]:
# Frequency of each distinct value in the `gender` column.
df["gender"].value_counts()

gender
Female    59
Male      41
Name: count, dtype: int64

In [11]:
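# `fever` has missing values, so it needs imputation. The categorical features
# (`gender`, `city`, `cough`) need encoding (one-hot, or ordinal where the
# categories are ordered), and the target `has_covid` can be label-encoded.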

In [12]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [13]:
# Separate the feature columns from the `has_covid` target and hold out 20% of
# the rows for testing. The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="has_covid"),
    df["has_covid"],
    test_size=0.2,
    random_state=42,
)

In [14]:
# (rows, columns) of the training and test feature frames.
X_train.shape, X_test.shape

((80, 5), (20, 5))

In [15]:
# Impute missing `fever` values with the column mean.
# Other supported strategies: 'median', 'most_frequent', 'constant'.
SI = SimpleImputer(strategy="mean")

# Fit on the training split only, then reuse the learned mean for the test
# split. Refitting on X_test would impute test rows with test-set statistics,
# which leaks information and makes the two splits inconsistent.
X_train_fever = SI.fit_transform(X_train[["fever"]])
X_test_fever = SI.transform(X_test[["fever"]])

In [16]:
# Shape of the imputed training `fever` array.
X_train_fever.shape

(80, 1)

In [17]:
# Encode cough severity as ordered integers following the explicit category
# order given below ("Mild" first, then "Strong").
cough_severity_order = ["Mild", "Strong"]
OE = OrdinalEncoder(categories=[cough_severity_order])

# Learn the encoding from the training split, then apply it to both splits.
OE.fit(X_train[["cough"]])
X_train_cough = OE.transform(X_train[["cough"]])
X_test_cough = OE.transform(X_test[["cough"]])

In [18]:
# Shape of the ordinal-encoded training `cough` array.
X_train_cough.shape

(80, 1)

In [19]:
# One-hot encode `gender` and `city`, dropping the first category of each
# feature, and return a dense int32 array instead of a sparse matrix.
OHE = OneHotEncoder(dtype=np.int32, drop="first", sparse_output=False)

# Fit on the training split only and reuse the learned categories for the test
# split. Refitting on X_test could yield a different number or order of output
# columns whenever the test rows do not contain every category, and would
# leave OHE holding test-set categories instead of training ones.
X_train_gender_city = OHE.fit_transform(X_train[["gender", "city"]])
X_test_gender_city = OHE.transform(X_test[["gender", "city"]])

In [20]:
# Shape of the one-hot encoded training array, plus the categories the encoder
# holds for each input feature.
X_train_gender_city.shape, OHE.categories_

((80, 4),
 [array(['Female', 'Male'], dtype=object),
  array(['Bangalore', 'Delhi', 'Kolkata', 'Mumbai'], dtype=object)])

In [21]:
# Drop every column that is preprocessed separately and keep what is left
# (the `age` column) as a 2-D NumPy array.
separately_processed = ["city", "gender", "cough", "fever"]
X_train_age = X_train.drop(columns=separately_processed).to_numpy()
X_test_age = X_test.drop(columns=separately_processed).to_numpy()

In [22]:
# Shape of the training `age` array.
X_train_age.shape

(80, 1)

In [23]:
# Assemble the final feature matrices by stacking the per-feature arrays
# column-wise in the order: age, fever, gender/city one-hot, cough.
train_parts = [X_train_age, X_train_fever, X_train_gender_city, X_train_cough]
test_parts = [X_test_age, X_test_fever, X_test_gender_city, X_test_cough]

X_train_transformed = np.hstack(train_parts)
X_test_transformed = np.hstack(test_parts)

In [24]:
# Shape of the assembled training feature matrix.
X_train_transformed.shape

(80, 7)

# Using scikit-learn's ColumnTransformer

In [25]:
import pandas as pd
import numpy as np

In [26]:
# Re-read the same CSV for the ColumnTransformer walkthrough.
df = pd.read_csv("./covid_toy.csv")

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. `random_state` is fixed so the split
# (and everything derived from it) is reproducible across kernel restarts.
# The target is selected with double brackets, so y_train / y_test are
# single-column DataFrames rather than Series.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(["has_covid"], axis=1),
    df[["has_covid"]],
    test_size=0.2,
    random_state=42,
)

In [28]:
# (rows, columns) of the training feature frame.
X_train.shape

(80, 5)

In [29]:
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [30]:
# Per-column preprocessing steps as (name, transformer, columns) triples.
preprocessing_steps = [
    # Fill missing `fever` values using SimpleImputer's default strategy.
    ("tnf1", SimpleImputer(), ["fever"]),
    # Encode cough severity following the explicit order: Mild, then Strong.
    ("tnf2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
    # Dense one-hot encoding with the first category of each feature dropped.
    ("tnf3", OneHotEncoder(drop="first", sparse_output=False), ["gender", "city"]),
]

# remainder="passthrough" keeps the columns not listed above (here, `age`)
# unchanged; the alternative, remainder="drop", would discard them.
transformer = ColumnTransformer(
    transformers=preprocessing_steps,
    remainder="passthrough",
)

In [31]:
# Learn every preprocessing parameter from the training split only.
X_train_transformed = transformer.fit_transform(X_train)

# Apply the already-fitted transformer to the test split (no refitting), so
# X_test_transformed matches this section's split rather than keeping the
# array built from the earlier manual-preprocessing split.
X_test_transformed = transformer.transform(X_test)

In [32]:
# Shape of the training matrix produced by the ColumnTransformer.
X_train_transformed.shape

(80, 7)