# Final Bootcamp Project

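## Predicting Monkeypox in patients

This notebook loads the raw patient data, inspects it, standardises the column names, checks for missing values and duplicate patient IDs, and saves the cleaned dataset.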


#### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import yaml

#### Load configuration and raw data

In [2]:
# params.yaml supplies the data paths used below (config['data'][...]).
# The encoding is pinned so the file is decoded the same way on every platform
# instead of falling back to the locale default.
with open('../params.yaml', encoding='utf-8') as file:
    config = yaml.safe_load(file)

In [11]:
# Make missing-value parsing explicit: only empty fields are read as NaN.
# pandas >= 2.0 added the literal string "None" to read_csv's default NA markers,
# so a column that uses "None" as a genuine category (presumably `Systemic Illness`
# here -- TODO confirm against the raw file) would silently become NaN on newer
# pandas and change every result below.
data = pd.read_csv(config['data']['raw'], keep_default_na=False, na_values=[''])
data.head()

Unnamed: 0,Patient_ID,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,P0,,False,True,True,True,False,True,False,False,Negative
1,P1,Fever,True,False,True,True,False,False,True,False,Positive
2,P2,Fever,False,True,True,False,False,False,True,False,Positive
3,P3,,True,False,False,False,True,True,True,False,Positive
4,P4,Swollen Lymph Nodes,True,True,True,False,False,True,True,False,Positive


In [4]:
# (number of rows, number of columns) of the raw frame.
data.shape

(25000, 11)

In [5]:
# Per-column dtype and non-null count, to spot missing values and unexpected types early.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Patient_ID                      25000 non-null  object
 1   Systemic Illness                25000 non-null  object
 2   Rectal Pain                     25000 non-null  bool  
 3   Sore Throat                     25000 non-null  bool  
 4   Penile Oedema                   25000 non-null  bool  
 5   Oral Lesions                    25000 non-null  bool  
 6   Solitary Lesion                 25000 non-null  bool  
 7   Swollen Tonsils                 25000 non-null  bool  
 8   HIV Infection                   25000 non-null  bool  
 9   Sexually Transmitted Infection  25000 non-null  bool  
 10  MonkeyPox                       25000 non-null  object
dtypes: bool(8), object(3)
memory usage: 781.4+ KB


In [6]:
# The frame holds only bool/object columns, so describe() reports
# count / unique / top / freq rather than numeric statistics.
data.describe()

Unnamed: 0,Patient_ID,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
count,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000
unique,25000,4,2,2,2,2,2,2,2,2,2
top,P0,Fever,False,True,True,False,True,True,True,False,Positive
freq,1,6382,12655,12554,12612,12514,12527,12533,12584,12554,15909


#### Standardise column names

In [7]:
def standardize_column_names(data: pd.DataFrame) -> pd.DataFrame:
    """Rename every column of ``data`` to lower snake_case.

    Each label is converted to ``str``, stripped of surrounding whitespace,
    has its spaces replaced by underscores and is lower-cased, e.g.
    ``"Systemic Illness"`` becomes ``"systemic_illness"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column labels are normalised. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so the result can be assigned
        or chained.
    """
    # str() keeps non-string labels (e.g. integer column labels) from raising
    # AttributeError; strip() stops stray header whitespace from turning into
    # leading/trailing underscores.
    data.columns = [
        str(col).strip().replace(" ", "_").lower() for col in data.columns
    ]
    return data

# The helper renames the columns on the frame it receives and returns that same frame;
# info() is re-run to confirm the new lower-case, underscore-separated labels.
data = standardize_column_names(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   patient_id                      25000 non-null  object
 1   systemic_illness                25000 non-null  object
 2   rectal_pain                     25000 non-null  bool  
 3   sore_throat                     25000 non-null  bool  
 4   penile_oedema                   25000 non-null  bool  
 5   oral_lesions                    25000 non-null  bool  
 6   solitary_lesion                 25000 non-null  bool  
 7   swollen_tonsils                 25000 non-null  bool  
 8   hiv_infection                   25000 non-null  bool  
 9   sexually_transmitted_infection  25000 non-null  bool  
 10  monkeypox                       25000 non-null  object
dtypes: bool(8), object(3)
memory usage: 781.4+ KB


#### Checking for NaN values

In [8]:
# Number of missing (NaN) entries per column. isna() only flags true missing
# values; placeholder strings stored in object columns are not counted.
data.isna().sum()

patient_id                        0
systemic_illness                  0
rectal_pain                       0
sore_throat                       0
penile_oedema                     0
oral_lesions                      0
solitary_lesion                   0
swollen_tonsils                   0
hiv_infection                     0
sexually_transmitted_infection    0
monkeypox                         0
dtype: int64

##### Result: No NaN values

#### Check for duplicates

In [9]:
# Flags repeated patient IDs only (True = ID already seen earlier in the frame);
# rows that are identical in every other column are not examined here.
data['patient_id'].duplicated().value_counts()

False    25000
Name: patient_id, dtype: int64

##### Result: No duplicate patient IDs

#### Saving cleaned data

In [10]:
# Write the cleaned frame to the path configured under data -> clean,
# leaving the row index out of the file.
data.to_csv(
    path_or_buf=config['data']['clean'],
    index=False,
)