In [46]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats


# Importing csv files

In [47]:
patients=pd.read_csv('Data cleaning.patients.csv')
treatments=pd.read_csv('Data cleaning.treatments.csv')
treatments_cut=pd.read_csv('Data cleaning.treatments_cut.csv')
adverse_reactions=pd.read_csv('Data cleaning .adverse_reactions.csv')

# Purpose

In [None]:
The primary objective of this analysis is to determine whether Auralin exhibits comparable effectiveness to Novodra in establishing a baseline HbA1c level.


# Additional information

In [None]:
Insulin resistance varies person to person, which is why both starting median daily dose and ending median daily dose are required, i.e., to
calculate change in dose.

It is important to test drugs and medical products in the people they are meant to help. People of different age, race, sex, and ethnic group
must be included in clinical trials. This diversity is reflected in the patients table.


## Assesing data


In [None]:
In this step, the data is to be understood more deeply. Before implementing methods to clean it

## Manual assesment

In [48]:
# Import the pandas library
import pandas as pd

# Assuming patients, treatments, treatment_cut, and adverse_reactions are DataFrames

# To manually assess the data, export the files into an Excel spreadsheet for better user interface
with pd.ExcelWriter('clinical_trials.xlsx') as writer:
    patients.to_excel(writer, sheet_name='patients')
    treatments.to_excel(writer, sheet_name='treatments')
    treatments_cut.to_excel(writer, sheet_name='treatment_cut')
    adverse_reactions.to_excel(writer, sheet_name='adverse_reactions')


## finding issues with dataset

### Dirty data 

In [None]:
Dirty Data (Data with Quality issues):
Dirty data, also known as low quality data. Low quality data has content issues.


Duplicated data,

Missing Data,

Corrupt Data,

Inaccurate Data


In [None]:

Table= `patients`
-patient_id= some rows have misspelled name                                          <accuracy issue>
- state= sometime contain full name sometime abbreviation                            <consistency issue>
- Zip_code = column has enties with 4 digit                                          <validity issue>
- address,city,state,zip_code,country,contact = 12 data missing for these columns    <completion issue>
- assigned_sex, zip_code ,birthdate = Incorrect data type                            <validity issue>
- given_name,surname = duplicate enties in the name of john doe                      <accuracy issue>
- weight = one patient has weight of 48 pounds                                                                                    <accuracy issue>
- height = one patient has hight of 27 inches                                        <accuracy issue>


Table = `treatments & treatments_cut`                                                  
-given_name and surname = all the rows are in lower case                              <consistency issue>
-auralin and novodra columns = remove u from auralin and novodra columns              <validity issue>
-auralin and novodra columns = '-'  is treated as null but in python it will be count <validity issue>
-Hba1c_change  = there are missing values                                             <completion issue>
- 1 duplicate entry by the name of joseph dey                                         <accuracy issue>
-hba1c_change = in this column 9 is instead of 4                                      <accuracy issue>

Table = `adverse_reactions`  
given_name and surname = all the rows are in lower case                               <consistency issue>


### Messy Data

In [None]:
 Messy Data When the data has structural issue is it known as Messy or untidy data.

Tidy data has the following properties:

Each variable forms a column

Each observation forms a row

Each observational unit forms a table

In [None]:
patients

Table= `patients`
-contact= contains both phone no and email id

Table =`treatments & treatments_cuts`
-auralin =it should be split in to two columns start and end dose
-novodra = it should be split in to two columns start and end dose

Table = `adverse reaction`
-This table should not exist independently


## Programatic assesment

In [49]:
patients['patient_id'].duplicated().sum()

0

In [50]:
patients[patients.duplicated(subset= ['given_name','surname'])]

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
229,230,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
237,238,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
244,245,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
251,252,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
277,278,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4


In [51]:
patients.describe()

Unnamed: 0,patient_id,zip_code,weight,height,bmi
count,503.0,491.0,503.0,503.0,503.0
mean,252.0,49084.118126,173.43499,66.634195,27.483897
std,145.347859,30265.807442,33.916741,4.411297,5.276438
min,1.0,1002.0,48.8,27.0,17.1
25%,126.5,21920.5,149.3,63.0,23.3
50%,252.0,48057.0,175.3,67.0,27.2
75%,377.5,75679.0,199.5,70.0,31.75
max,503.0,99701.0,255.9,79.0,37.7


In [11]:
patients[patients['weight']== 48.800000]


Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
210,211,female,Camilla,Zaitseva,4689 Briarhill Lane,Wooster,OH,44691.0,United States,330-202-2145CamillaZaitseva@superrito.com,11/26/1938,48.8,63,19.1


In [52]:
patients[patients['height']== 27.000000]

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
4,5,male,Tim,Neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,334-515-7487TimNeudorf@cuvox.de,2/18/1928,192.3,27,26.1


In [53]:
treatments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   given_name    280 non-null    object 
 1   surname       280 non-null    object 
 2   auralin       280 non-null    object 
 3   novodra       280 non-null    object 
 4   hba1c_start   280 non-null    float64
 5   hba1c_end     280 non-null    float64
 6   hba1c_change  171 non-null    float64
dtypes: float64(3), object(4)
memory usage: 15.4+ KB


In [54]:
treatments.duplicated().sum()
treatments[treatments.duplicated()]

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
136,joseph,day,29u - 36u,-,7.7,7.19,


In [55]:
treatments.describe()

Unnamed: 0,hba1c_start,hba1c_end,hba1c_change
count,280.0,280.0,171.0
mean,7.985929,7.589286,0.546023
std,0.568638,0.569672,0.279555
min,7.5,7.01,0.2
25%,7.66,7.27,0.34
50%,7.8,7.42,0.38
75%,7.97,7.57,0.92
max,9.95,9.58,0.99


In [56]:
treatments.sort_values('hba1c_change')

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
275,albina,zetticci,45u - 51u,-,7.93,7.73,0.20
70,tosh,jensen,-,51u - 48u,7.93,7.69,0.24
237,manouck,wubbels,55u - 62u,-,7.66,7.40,0.26
156,chidalu,onyekaozulu,-,42u - 41u,7.54,7.27,0.27
200,nicolas,ferreira,43u - 51u,-,7.99,7.72,0.27
...,...,...,...,...,...,...,...
269,hiromu,horikawa,-,47u - 46u,7.77,7.28,
271,leo,vieira,-,30u - 33u,7.74,7.36,
273,kate,wilkinson,36u - 39u,-,7.72,7.20,
274,naja,enoksen,43u - 50u,-,7.98,7.59,


In [57]:
treatments_cut.sort_values('hba1c_change')

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
40,eufemio,rosario,-,37u - 40u,7.54,7.26,0.28
46,anja,mueller,43u - 56u,-,7.58,7.29,0.29
67,bernd,schneider,48u - 56u,-,7.74,7.44,0.30
59,robert,maslov,55u - 68u,-,7.65,7.35,0.30
25,siebrigje,koldenhof,30u - 38u,-,7.90,7.59,0.31
...,...,...,...,...,...,...,...
60,maret,sultygov,-,26u - 23u,7.67,7.30,
62,fakhri,fakhoury,39u - 50u,-,7.83,7.39,
63,žarka,rap,35u - 48u,-,7.54,7.15,
68,berta,napolitani,-,42u - 44u,7.68,7.21,


In [58]:
adverse_reactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   given_name        34 non-null     object
 1   surname           34 non-null     object
 2   adverse_reaction  34 non-null     object
dtypes: object(3)
memory usage: 948.0+ bytes


In [59]:
adverse_reactions.duplicated().sum()

0

## Note - Assesing is an iterative process

### Always make copy before cleaning

In [60]:
patients_copy=patients.copy()
treatment_copy=treatments.copy()
treatment_cut_copy=treatments_cut.copy() 
adverse_copy=adverse_reactions.copy()

#### first solve completeness issue

In [61]:
patients_copy.fillna('no data available', inplace = True)
# REPLACE MISSINNG DATA WITH NO DATA AVAILABLE

In [62]:
treatment_copy.info()

# SUBTRACT hbalc_start TO hbalc_end TO GET ALL THE MISSING Hba1c_change
treatment_copy['hba1c_changes']=treatment_copy['hba1c_start'] - treatment_copy['hba1c_end']
# test
treatment_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   given_name    280 non-null    object 
 1   surname       280 non-null    object 
 2   auralin       280 non-null    object 
 3   novodra       280 non-null    object 
 4   hba1c_start   280 non-null    float64
 5   hba1c_end     280 non-null    float64
 6   hba1c_change  171 non-null    float64
dtypes: float64(3), object(4)
memory usage: 15.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   given_name     280 non-null    object 
 1   surname        280 non-null    object 
 2   auralin        280 non-null    object 
 3   novodra        280 non-null    object 
 4   hba1c_start    280 non-null    float64
 5   hba1c_end      280 non-null    float64


In [63]:
treatment_cut_copy['hba1c_changes']=treatment_cut_copy['hba1c_start']-treatment_cut_copy['hba1c_end']
treatment_cut_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   given_name     70 non-null     object 
 1   surname        70 non-null     object 
 2   auralin        70 non-null     object 
 3   novodra        70 non-null     object 
 4   hba1c_start    70 non-null     float64
 5   hba1c_end      70 non-null     float64
 6   hba1c_change   42 non-null     float64
 7   hba1c_changes  70 non-null     float64
dtypes: float64(4), object(4)
memory usage: 4.5+ KB


#### solveing tidyness issue 

In [64]:
#contact column have to fix
# Extract phone numbers and email addresses separately using regular expressions
patients_copy['phone_number'] = patients_copy['contact'].str.extract(r'(\+?\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})')
patients_copy['email'] = patients_copy['contact'].str.extract(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})')

In [65]:
# Strip any leading or trailing spaces from the 'email' column
patients_copy['email'] = patients_copy['email'].str.strip()

In [66]:
# Drop the original 'contact' column
patients_copy.drop(columns=['contact'], inplace=True)


In [67]:
import re 

# Convert NaN values to empty strings
patients_copy['email'] = patients_copy['email'].fillna('')

In [68]:
# Extract phone numbers from the beginning of email addresses
patients_copy['phone_number_two'] = patients_copy['email'].apply(lambda x: re.match(r'^\d{3}-\d{3}-\d{4}', str(x)).group() if re.match(r'^\d{3}-\d{3}-\d{4}', str(x)) else np.nan)



In [69]:
# Remove phone numbers from email addresses using regular expressions
patients_copy['email'] = patients_copy['email'].str.replace(r'^\d{3}-\d{3}-\d{4}', '', regex=True)


In [70]:
# Create a new column 'phn-no' and fill it with values from 'phone_number'
patients_copy['phn-no'] = patients_copy['phone_number']

In [71]:
# Replace NaN values in 'phn-no' with values from 'phone_number_two'
patients_copy['phn-no'].fillna(patients_copy['phone_number_two'], inplace=True)

In [72]:
# Drop 'phone_number' and 'phone_number_two' columns
patients_copy.drop(columns=['phone_number', 'phone_number_two'], inplace=True)

In [73]:
patients_copy

#concating treatment_copy and treatment_cut_copy

treatment_copy = pd.concat([treatment_copy,treatment_cut_copy])

treatment_copy.info()
treatment_copy

<class 'pandas.core.frame.DataFrame'>
Index: 350 entries, 0 to 69
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   given_name     350 non-null    object 
 1   surname        350 non-null    object 
 2   auralin        350 non-null    object 
 3   novodra        350 non-null    object 
 4   hba1c_start    350 non-null    float64
 5   hba1c_end      350 non-null    float64
 6   hba1c_change   213 non-null    float64
 7   hba1c_changes  350 non-null    float64
dtypes: float64(4), object(4)
memory usage: 24.6+ KB


Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change,hba1c_changes
0,veronika,jindrová,41u - 48u,-,7.63,7.20,,0.43
1,elliot,richardson,-,40u - 45u,7.56,7.09,0.97,0.47
2,yukitaka,takenaka,-,39u - 36u,7.68,7.25,,0.43
3,skye,gormanston,33u - 36u,-,7.97,7.62,0.35,0.35
4,alissa,montez,-,33u - 29u,7.78,7.46,0.32,0.32
...,...,...,...,...,...,...,...,...
65,rovzan,kishiev,32u - 37u,-,7.75,7.41,0.34,0.34
66,jakob,jakobsen,-,28u - 26u,7.96,7.51,0.95,0.45
67,bernd,schneider,48u - 56u,-,7.74,7.44,0.30,0.30
68,berta,napolitani,-,42u - 44u,7.68,7.21,,0.47


In [74]:
# aurain and novodra must be in a same column under medicine type 
treatment_copy=treatment_copy.melt(id_vars=['given_name','surname','hba1c_start','hba1c_end','hba1c_change','hba1c_changes'],var_name='medicine type',value_name='doasge_range')


In [75]:
treatment_copy.info()
treatment_copy

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   given_name     700 non-null    object 
 1   surname        700 non-null    object 
 2   hba1c_start    700 non-null    float64
 3   hba1c_end      700 non-null    float64
 4   hba1c_change   426 non-null    float64
 5   hba1c_changes  700 non-null    float64
 6   medicine type  700 non-null    object 
 7   doasge_range   700 non-null    object 
dtypes: float64(4), object(4)
memory usage: 43.9+ KB


Unnamed: 0,given_name,surname,hba1c_start,hba1c_end,hba1c_change,hba1c_changes,medicine type,doasge_range
0,veronika,jindrová,7.63,7.20,,0.43,auralin,41u - 48u
1,elliot,richardson,7.56,7.09,0.97,0.47,auralin,-
2,yukitaka,takenaka,7.68,7.25,,0.43,auralin,-
3,skye,gormanston,7.97,7.62,0.35,0.35,auralin,33u - 36u
4,alissa,montez,7.78,7.46,0.32,0.32,auralin,-
...,...,...,...,...,...,...,...,...
695,rovzan,kishiev,7.75,7.41,0.34,0.34,novodra,-
696,jakob,jakobsen,7.96,7.51,0.95,0.45,novodra,28u - 26u
697,bernd,schneider,7.74,7.44,0.30,0.30,novodra,-
698,berta,napolitani,7.68,7.21,,0.47,novodra,42u - 44u


In [76]:
#auralin dosage =it should be split in to two columns start and end dose
#novodra dosage = it should be split in to two columns start and end dose
treatment_copy=treatment_copy[treatment_copy['doasge_range'] != '-']


In [77]:
treatment_copy['dosage_start']=treatment_copy['doasge_range'].str.split('-').str.get(0)
treatment_copy['dosage_end']=treatment_copy['doasge_range'].str.split('-').str.get(1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treatment_copy['dosage_start']=treatment_copy['doasge_range'].str.split('-').str.get(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treatment_copy['dosage_end']=treatment_copy['doasge_range'].str.split('-').str.get(1)


In [78]:
treatment_copy.drop(columns=['doasge_range','hba1c_change'],inplace=True)

treatment_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 350 entries, 0 to 698
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   given_name     350 non-null    object 
 1   surname        350 non-null    object 
 2   hba1c_start    350 non-null    float64
 3   hba1c_end      350 non-null    float64
 4   hba1c_changes  350 non-null    float64
 5   medicine type  350 non-null    object 
 6   dosage_start   350 non-null    object 
 7   dosage_end     350 non-null    object 
dtypes: float64(3), object(5)
memory usage: 24.6+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treatment_copy.drop(columns=['doasge_range','hba1c_change'],inplace=True)


In [79]:
# in column dosage_start,dosage_end u should not be there  datatype should be int
treatment_copy['dosage_start']=treatment_copy['dosage_start'].str.replace("u","")
treatment_copy['dosage_end']=treatment_copy['dosage_end'].str.replace("u","")
# to change datatype in to int
treatment_copy['dosage_start']=treatment_copy['dosage_start'].astype(int)
treatment_copy['dosage_end']=treatment_copy['dosage_end'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treatment_copy['dosage_start']=treatment_copy['dosage_start'].str.replace("u","")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treatment_copy['dosage_end']=treatment_copy['dosage_end'].str.replace("u","")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treatment_copy['dosage_start']=treatment_copy

In [80]:
#check if replace is done correctly and data typr change str to int
treatment_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 350 entries, 0 to 698
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   given_name     350 non-null    object 
 1   surname        350 non-null    object 
 2   hba1c_start    350 non-null    float64
 3   hba1c_end      350 non-null    float64
 4   hba1c_changes  350 non-null    float64
 5   medicine type  350 non-null    object 
 6   dosage_start   350 non-null    int32  
 7   dosage_end     350 non-null    int32  
dtypes: float64(3), int32(2), object(3)
memory usage: 21.9+ KB


In [81]:
#conbining adverse reaction table with treatment table
treatment_copy=treatment_copy.merge(adverse_copy,how='left',on=['given_name','surname'])

#### solving validity issue

In [82]:
#in patients table column name zip_code has 2 issue first data type is str should be int second some time pin code is of 4 digit but i decide to ignoor this column bcz it does not effect the conclusion that i want to find
#changing birthdate data type object to date time 
patients_copy['birthdate'] = pd.to_datetime(patients_copy['birthdate'])
#check if datatype change or not
patients_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   patient_id    503 non-null    int64         
 1   assigned_sex  503 non-null    object        
 2   given_name    503 non-null    object        
 3   surname       503 non-null    object        
 4   address       503 non-null    object        
 5   city          503 non-null    object        
 6   state         503 non-null    object        
 7   zip_code      503 non-null    object        
 8   country       503 non-null    object        
 9   birthdate     503 non-null    datetime64[ns]
 10  weight        503 non-null    float64       
 11  height        503 non-null    int64         
 12  bmi           503 non-null    float64       
 13  email         503 non-null    object        
 14  phn-no        404 non-null    object        
dtypes: datetime64[ns](1), float64(2), int64(

#### solving accuracy issue

In [83]:
# changing name from dsvid to david
#patients_copy=patients_copy.replace(patients_copy['given_name']=='Dsvid','David',inplace=True)

# Convert 'given_name' column to lowercase
patients_copy['given_name'] = patients_copy['given_name'].str.lower()
patients_copy['surname'] = patients_copy['surname'].str.lower()

In [84]:
#there are duplicate entry in the name of john doe we have to drop the duplicates
patients_copy.drop_duplicates(subset=['given_name', 'surname'], inplace=True)
patients_copy

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,email,phn-no
0,1,female,zoe,wellish,576 Brown Bear Drive,Rancho California,California,92390.0,United States,1976-07-10,121.7,66,19.6,ZoeWellish@superrito.com,951-719-9170
1,2,female,pamela,hill,2370 University Hill Road,Armstrong,Illinois,61812.0,United States,1967-04-03,118.8,66,19.2,PamelaSHill@cuvox.de,+1 (217) 569-3204
2,3,male,jae,debord,1493 Poling Farm Road,York,Nebraska,68467.0,United States,1980-02-19,177.8,71,24.8,JaeMDebord@gustr.com,402-363-6804
3,4,male,liêm,phan,2335 Webster Street,Woodbridge,NJ,7095.0,United States,1951-07-26,220.9,70,31.7,PhanBaLiem@jourrapide.com,+1 (732) 636-8246
4,5,male,tim,neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,1928-02-18,192.3,27,26.1,TimNeudorf@cuvox.de,334-515-7487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,499,male,mustafa,lindström,2530 Victoria Court,Milton Mills,ME,3852.0,United States,1959-04-10,181.1,72,24.6,MustafaLindstrom@jourrapide.com,207-477-0579
499,500,male,ruman,bisliev,494 Clarksburg Park Road,Sedona,AZ,86341.0,United States,1948-03-26,239.6,70,34.4,RumanBisliev@gustr.com,928-284-4492
500,501,female,jinke,de keizer,649 Nutter Street,Overland Park,MO,64110.0,United States,1971-01-13,171.2,67,26.8,JinkedeKeizer@teleworm.us,816-223-6007
501,502,female,chidalu,onyekaozulu,3652 Boone Crockett Lane,Seattle,WA,98109.0,United States,1952-02-13,176.9,67,27.7,ChidaluOnyekaozulu@jourrapide.com,1 360 443 2060


In [85]:
#change weight pound to kg
patients_copy['weight']=patients_copy['weight']*0.453592
patients_copy

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,email,phn-no
0,1,female,zoe,wellish,576 Brown Bear Drive,Rancho California,California,92390.0,United States,1976-07-10,55.202146,66,19.6,ZoeWellish@superrito.com,951-719-9170
1,2,female,pamela,hill,2370 University Hill Road,Armstrong,Illinois,61812.0,United States,1967-04-03,53.886730,66,19.2,PamelaSHill@cuvox.de,+1 (217) 569-3204
2,3,male,jae,debord,1493 Poling Farm Road,York,Nebraska,68467.0,United States,1980-02-19,80.648658,71,24.8,JaeMDebord@gustr.com,402-363-6804
3,4,male,liêm,phan,2335 Webster Street,Woodbridge,NJ,7095.0,United States,1951-07-26,100.198473,70,31.7,PhanBaLiem@jourrapide.com,+1 (732) 636-8246
4,5,male,tim,neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,1928-02-18,87.225742,27,26.1,TimNeudorf@cuvox.de,334-515-7487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,499,male,mustafa,lindström,2530 Victoria Court,Milton Mills,ME,3852.0,United States,1959-04-10,82.145511,72,24.6,MustafaLindstrom@jourrapide.com,207-477-0579
499,500,male,ruman,bisliev,494 Clarksburg Park Road,Sedona,AZ,86341.0,United States,1948-03-26,108.680643,70,34.4,RumanBisliev@gustr.com,928-284-4492
500,501,female,jinke,de keizer,649 Nutter Street,Overland Park,MO,64110.0,United States,1971-01-13,77.654950,67,26.8,JinkedeKeizer@teleworm.us,816-223-6007
501,502,female,chidalu,onyekaozulu,3652 Boone Crockett Lane,Seattle,WA,98109.0,United States,1952-02-13,80.240425,67,27.7,ChidaluOnyekaozulu@jourrapide.com,1 360 443 2060


In [86]:
# convert height inches to meter 
patients_copy['height']=patients_copy['height']/39.37
patients_copy

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,email,phn-no
0,1,female,zoe,wellish,576 Brown Bear Drive,Rancho California,California,92390.0,United States,1976-07-10,55.202146,1.676403,19.6,ZoeWellish@superrito.com,951-719-9170
1,2,female,pamela,hill,2370 University Hill Road,Armstrong,Illinois,61812.0,United States,1967-04-03,53.886730,1.676403,19.2,PamelaSHill@cuvox.de,+1 (217) 569-3204
2,3,male,jae,debord,1493 Poling Farm Road,York,Nebraska,68467.0,United States,1980-02-19,80.648658,1.803404,24.8,JaeMDebord@gustr.com,402-363-6804
3,4,male,liêm,phan,2335 Webster Street,Woodbridge,NJ,7095.0,United States,1951-07-26,100.198473,1.778004,31.7,PhanBaLiem@jourrapide.com,+1 (732) 636-8246
4,5,male,tim,neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,1928-02-18,87.225742,0.685801,26.1,TimNeudorf@cuvox.de,334-515-7487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,499,male,mustafa,lindström,2530 Victoria Court,Milton Mills,ME,3852.0,United States,1959-04-10,82.145511,1.828804,24.6,MustafaLindstrom@jourrapide.com,207-477-0579
499,500,male,ruman,bisliev,494 Clarksburg Park Road,Sedona,AZ,86341.0,United States,1948-03-26,108.680643,1.778004,34.4,RumanBisliev@gustr.com,928-284-4492
500,501,female,jinke,de keizer,649 Nutter Street,Overland Park,MO,64110.0,United States,1971-01-13,77.654950,1.701803,26.8,JinkedeKeizer@teleworm.us,816-223-6007
501,502,female,chidalu,onyekaozulu,3652 Boone Crockett Lane,Seattle,WA,98109.0,United States,1952-02-13,80.240425,1.701803,27.7,ChidaluOnyekaozulu@jourrapide.com,1 360 443 2060


In [87]:
#due to the units of height and weight change we have to calcualte proper bmi
patients_copy['bmi']=round(patients_copy['weight'] / (patients_copy['height'])**2,2)

In [88]:
from datetime import datetime

# Assuming patients_copy is your DataFrame and 'birthdate' column exists
# Convert 'birthdate' column to datetime format
patients_copy['birthdate'] = pd.to_datetime(patients_copy['birthdate'], errors='coerce')

In [90]:
# Calculate age based on current date
current_date = datetime.now()
patients_copy['age'] = (current_date - patients_copy['birthdate']).dt.days // 365

In [91]:
patients_copy

# there is a duplicate row in column given_name,surname 
treatment_copy[treatment_copy.duplicated(subset= ['given_name','surname'])]


Unnamed: 0,given_name,surname,hba1c_start,hba1c_end,hba1c_changes,medicine type,dosage_start,dosage_end,adverse_reaction
62,joseph,day,7.7,7.19,0.51,auralin,29,36,hypoglycemia


In [92]:
#Drop the duplicate column
treatment_copy = treatment_copy[~treatment_copy.duplicated(subset=['given_name', 'surname'])]
treatment_copy

Unnamed: 0,given_name,surname,hba1c_start,hba1c_end,hba1c_changes,medicine type,dosage_start,dosage_end,adverse_reaction
0,veronika,jindrová,7.63,7.20,0.43,auralin,41,48,
1,skye,gormanston,7.97,7.62,0.35,auralin,33,36,
2,sophia,haugen,7.65,7.27,0.38,auralin,37,42,
3,eddie,archer,7.89,7.55,0.34,auralin,31,38,
4,asia,woźniak,7.76,7.37,0.39,auralin,30,36,
...,...,...,...,...,...,...,...,...,...
345,christopher,woodward,7.51,7.06,0.45,novodra,55,51,nausea
346,maret,sultygov,7.67,7.30,0.37,novodra,26,23,
347,lixue,hsueh,9.21,8.80,0.41,novodra,22,23,injection site discomfort
348,jakob,jakobsen,7.96,7.51,0.45,novodra,28,26,hypoglycemia


In [93]:
#### solve consistency issue

#there are many abbreviation of states it was not possible to identify each one of them so i decidi to keep it as it was

treatment_copy.sort_values(by='hba1c_start')

patients_copy

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,email,phn-no,age
0,1,female,zoe,wellish,576 Brown Bear Drive,Rancho California,California,92390.0,United States,1976-07-10,55.202146,1.676403,19.64,ZoeWellish@superrito.com,951-719-9170,47
1,2,female,pamela,hill,2370 University Hill Road,Armstrong,Illinois,61812.0,United States,1967-04-03,53.886730,1.676403,19.17,PamelaSHill@cuvox.de,+1 (217) 569-3204,57
2,3,male,jae,debord,1493 Poling Farm Road,York,Nebraska,68467.0,United States,1980-02-19,80.648658,1.803404,24.80,JaeMDebord@gustr.com,402-363-6804,44
3,4,male,liêm,phan,2335 Webster Street,Woodbridge,NJ,7095.0,United States,1951-07-26,100.198473,1.778004,31.70,PhanBaLiem@jourrapide.com,+1 (732) 636-8246,72
4,5,male,tim,neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,1928-02-18,87.225742,0.685801,185.46,TimNeudorf@cuvox.de,334-515-7487,96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,499,male,mustafa,lindström,2530 Victoria Court,Milton Mills,ME,3852.0,United States,1959-04-10,82.145511,1.828804,24.56,MustafaLindstrom@jourrapide.com,207-477-0579,65
499,500,male,ruman,bisliev,494 Clarksburg Park Road,Sedona,AZ,86341.0,United States,1948-03-26,108.680643,1.778004,34.38,RumanBisliev@gustr.com,928-284-4492,76
500,501,female,jinke,de keizer,649 Nutter Street,Overland Park,MO,64110.0,United States,1971-01-13,77.654950,1.701803,26.81,JinkedeKeizer@teleworm.us,816-223-6007,53
501,502,female,chidalu,onyekaozulu,3652 Boone Crockett Lane,Seattle,WA,98109.0,United States,1952-02-13,80.240425,1.701803,27.71,ChidaluOnyekaozulu@jourrapide.com,1 360 443 2060,72


In [None]:
# These are the two files after cleaning that  i am going to use 
patients_copy.to_csv('updated_patients_detailed_table.csv',index=False)

treatment_copy.to_csv('updated_treatments_detailed_table.csv',index=False )