In [1]:
import pandas as pd 
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, classification_report

In [2]:
df = pd.read_csv('covid_toy.csv')

In [3]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [5]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [6]:
df.shape

(100, 6)

In [7]:
x = df.drop(columns=['has_covid'])
y = df['has_covid']

In [8]:
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns=['has_covid']),df['has_covid'],test_size=0.2)

In [9]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(),['fever']),
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tnf3',OneHotEncoder(sparse_output=False,drop='first'),['gender','city'])
],remainder='passthrough')

In [10]:
transformer

In [23]:
x_train_transform = transformer.fit_transform(x_train)

In [12]:
transformer.fit_transform(x_train).shape

(80, 7)

In [24]:
x_test_transform = transformer.transform(x_test)

In [25]:
x_train.isnull().sum()

age       0
gender    0
fever     8
cough     0
city      0
dtype: int64

In [26]:
x_test.isnull().sum()

age       0
gender    0
fever     2
cough     0
city      0
dtype: int64

In [27]:
lr = LogisticRegression(max_iter=1000)
lr

In [29]:
lr.fit(x_train_transform,y_train)

In [30]:
y_pred = lr.predict(x_test_transform)
y_pred

array(['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No'],
      dtype=object)

In [31]:
pd.DataFrame({'y_test' : y_test, 'y_predict' : y_pred})

Unnamed: 0,y_test,y_predict
91,Yes,No
2,No,No
54,Yes,No
36,No,No
92,No,No
80,Yes,No
1,Yes,No
49,No,No
66,No,No
97,No,No


In [32]:
print('Accuracy Score: ',accuracy_score(y_test,y_pred))

Accuracy Score:  0.5


In [33]:
print('Classification Report: ', classification_report(y_test,y_pred))

Classification Report:                precision    recall  f1-score   support

          No       0.44      1.00      0.62         8
         Yes       1.00      0.17      0.29        12

    accuracy                           0.50        20
   macro avg       0.72      0.58      0.45        20
weighted avg       0.78      0.50      0.42        20

