In [2]:
## Data ---> Separate (Categorical, Numerical)
# Categorical Data ---> Fill missing values ---> Encode
# Numerical Data ---> Fill missing values ---> Standardize
# Apply ---> ML Model
# Calculate Model's Performance

In [3]:
import pandas as pd

In [4]:
# Load the toy COVID dataset into a DataFrame.
# NOTE(review): this is an absolute, user-specific Windows path, so the
# notebook only runs on a machine where this exact file exists.
df = pd.read_csv(
    filepath_or_buffer='C:\\Users\\vedan\\OneDrive\\Documents\\Regex\\Python\\ML\\CSV Files\\covid_toy.csv',
)

In [5]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)


Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103,Mild,Kolkata,No
1,27,Male,100,Mild,Delhi,Yes
2,42\n31,Male\nFemale,101\n98,Mild\nMild,Delhi\nKolkata,No\nNo
3,65,Female,101,Mild,Mumbai,No
4,84,Female,,Mild,Bangalore,Yes


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [7]:
# Target labels: the `has_covid` column.
y = df['has_covid']
# Feature matrix: every other column. `drop` returns a new frame, so `df`
# itself is left unchanged.
x = df.drop('has_covid', axis=1)

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=56,
    test_size=0.2,
)

In [9]:
# Column groups consumed by the preprocessing ColumnTransformer.
# NOTE(review): 'cough' appears in the data but in neither list, so it is not
# fed to the model -- confirm this omission is intentional.
categorical_features = ['gender', 'city']
numerical_features = ['age', 'fever']


def _coerce_numeric(frame, columns):
    """Return a copy of `frame` with `columns` parsed as numbers.

    Cells that cannot be parsed (e.g. strings holding two values separated by
    a newline) become NaN.
    """
    cleaned = frame.copy()
    for col in columns:
        cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')
    return cleaned


# x / x_train / x_test were derived from df *before* this cell, so cleaning df
# alone never reaches them. Coerce them as well so the pipeline's mean imputer
# and scaler receive numeric columns. NaNs are deliberately left in place here;
# filling them is the job of the pipeline's SimpleImputer step.
x = _coerce_numeric(x, numerical_features)
x_train = _coerce_numeric(x_train, numerical_features)
x_test = _coerce_numeric(x_test, numerical_features)

# Keep df itself numeric and gap-free for the inspection cells further down.
df[numerical_features] = df[numerical_features].apply(
    pd.to_numeric, errors='coerce'
)
df[numerical_features] = df[numerical_features].fillna(df[numerical_features].mean())



In [10]:
## Per-column-type preprocessing recipes

# Numeric columns: fill gaps with the column mean, then standardize.
numeric_transformers = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time rather
# than raising.
categoric_transformers = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encode', OneHotEncoder(handle_unknown='ignore')),
    ]
)


In [11]:
## Route each column group through its own transformer
# Only the columns named in the two feature lists are passed on; with
# ColumnTransformer's default `remainder='drop'`, any other column is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformers, numerical_features),
        ('cat', categoric_transformers, categorical_features),
    ]
)


In [12]:
## End-to-end model: preprocessing followed by a random forest classifier
# The forest is seeded (same seed as the train/test split) so that repeated
# runs of the notebook produce the same fitted model and the same metrics.
rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=56)),
])

In [13]:
# Count missing values per column.
df.isna().sum()

age          0
gender       0
fever        0
cough        0
city         0
has_covid    0
dtype: int64

In [14]:
# Re-declared here so the cleaning cells can be read and run as a unit.
numerical_features = ['age', 'fever']

# Trim only leading/trailing whitespace from string cells.
# Deleting every non-digit character would glue a multi-value cell such as
# "42\n31" into the bogus number 4231 and would silently drop minus signs.
# With outer-only trimming, such malformed cells stay unparseable and are
# turned into NaN by the numeric coercion that follows.
df[numerical_features] = df[numerical_features].replace(
    r'^\s+|\s+$', '', regex=True
)


In [15]:
# Parse each numeric feature column as numbers; anything unparseable
# becomes NaN instead of raising.
for column in numerical_features:
    df[column] = pd.to_numeric(df[column], errors='coerce')


In [16]:
# Fill remaining gaps in each numeric column with that column's mean.
column_means = df[numerical_features].mean()
df[numerical_features] = df[numerical_features].fillna(column_means)


In [17]:
# Confirm the numeric feature columns now carry a numeric dtype.
df.dtypes.loc[numerical_features]



age      float64
fever    float64
dtype: object

In [18]:
# Inspect the first 11 rows after cleaning.
df.head(n=11)


Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60.0,Male,103.0,Mild,Kolkata,No
1,27.0,Male,100.0,Mild,Delhi,Yes
2,40.653846,Male\nFemale,100.942308,Mild\nMild,Delhi\nKolkata,No\nNo
3,65.0,Female,101.0,Mild,Mumbai,No
4,84.0,Female,100.942308,Mild,Bangalore,Yes
5,40.653846,Male\nFemale,101.0,Strong\nStrong,Bangalore\nMumbai,No\nYes
6,19.0,Female,100.0,Strong,Bangalore,No
7,64.0,Female,101.0,Mild,Delhi,No
8,40.653846,Female\nFemale,98.0,Mild\nMild,Delhi\nMumbai,No\nYes
9,25.0,Female,99.0,Strong,Kolkata,No


In [20]:
# Inspect the first 20 rows of just the numeric feature columns.
df.loc[:, numerical_features].head(n=20)


Unnamed: 0,age,fever
0,60.0,103.0
1,27.0,100.0
2,40.653846,100.942308
3,65.0,101.0
4,84.0,100.942308
5,40.653846,101.0
6,19.0,100.0
7,64.0,101.0
8,40.653846,98.0
9,25.0,99.0


In [21]:
# Show the Python type of every cell in the numeric feature columns.
# `DataFrame.applymap` is deprecated in recent pandas in favour of
# `DataFrame.map`; prefer `map` when it exists and fall back to `applymap`
# on older pandas versions, so no deprecation warning is emitted.
numeric_frame = df[numerical_features]
elementwise = getattr(numeric_frame, 'map', None) or numeric_frame.applymap
elementwise(type)


  df[numerical_features].applymap(type)


Unnamed: 0,age,fever
0,<class 'float'>,<class 'float'>
1,<class 'float'>,<class 'float'>
2,<class 'float'>,<class 'float'>
3,<class 'float'>,<class 'float'>
4,<class 'float'>,<class 'float'>
...,...,...
71,<class 'float'>,<class 'float'>
72,<class 'float'>,<class 'float'>
73,<class 'float'>,<class 'float'>
74,<class 'float'>,<class 'float'>


In [22]:
# Normalize each numeric feature column: view the values as text, trim
# surrounding whitespace, and parse back to numbers.
# Only the ends are stripped. Removing *all* whitespace would fuse a
# multi-value cell such as "42\n31" into the bogus number 4231; leaving the
# inner newline in place makes such a cell unparseable, so it is coerced to NaN.
for col in numerical_features:
    trimmed = df[col].astype(str).str.strip()
    df[col] = pd.to_numeric(trimmed, errors="coerce")


In [23]:
# A cell only displays its last expression, so a bare `.dtypes` line placed
# before it would be evaluated and thrown away. Print the dtypes explicitly,
# then leave the NaN counts as the cell's displayed result.
print(df[numerical_features].dtypes)
df[numerical_features].isna().sum()


age      0
fever    0
dtype: int64