# Arulin Vs. Novadra: Clinical Trial Analysis

## Gather

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
patients = pd.read_csv("patients.csv")
treatments = pd.concat([pd.read_csv("treatments.csv"), pd.read_csv("treatments_cut.csv")])
reactions = pd.read_csv("adverse_reactions.csv")


In [3]:
patients.sample(5)

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
220,221,male,Mijael,Guerra,1271 Pretty View Lane,Oakland,CA,94612.0,United States,MijaelGuerraMoreno@teleworm.us+1 (707) 896-9250,1/25/1944,183.5,65,30.5
184,185,female,Ásta,Grímsdóttir,1619 Melm Street,Ormond Beach,FL,32174.0,United States,AstaGrimsdottir@dayrep.com+1 (386) 989-0019,6/9/1999,128.9,65,21.4
135,136,male,Willem-Jan,van der Lubbe,1717 Vineyard Drive,Cleveland,OH,44115.0,United States,Willem-JanvanderLubbe@gustr.com440-385-5011,7/9/1941,152.9,69,22.6
490,491,male,Jackson,Addison,1160 Taylor Street,New Rochelle,New York,10801.0,United States,914-636-9304JacksonAddison@armyspy.com,5/29/1953,192.7,69,28.5
90,91,male,Ingo,Rokavc,4271 Cherry Ridge Drive,Buffalo,New York,14214.0,United States,IngoRokavc@superrito.com+1 (585) 902-9127,8/27/1977,192.3,67,30.1


In [4]:
treatments.sample(5)

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
36,chỉ,lâm,45u - 48u,-,7.68,7.24,
61,ella,lund,38u - 47u,-,7.57,7.21,0.36
210,vanessa,ferguson,39u - 50u,-,7.97,7.58,
26,firenze,fodor,-,30u - 35u,7.89,7.55,0.34
278,vallie,prince,31u - 38u,-,7.64,7.28,0.36


In [5]:
reactions.sample(5)

Unnamed: 0,given_name,surname,adverse_reaction
7,albinca,komavec,hypoglycemia
27,idalia,moore,hypoglycemia
8,noe,aranda,hypoglycemia
17,christopher,woodward,nausea
20,anenechi,chidi,hypoglycemia


## Assess

In [6]:
patients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   patient_id    503 non-null    int64  
 1   assigned_sex  503 non-null    object 
 2   given_name    503 non-null    object 
 3   surname       503 non-null    object 
 4   address       491 non-null    object 
 5   city          491 non-null    object 
 6   state         491 non-null    object 
 7   zip_code      491 non-null    float64
 8   country       491 non-null    object 
 9   contact       491 non-null    object 
 10  birthdate     503 non-null    object 
 11  weight        503 non-null    float64
 12  height        503 non-null    int64  
 13  bmi           503 non-null    float64
dtypes: float64(3), int64(2), object(9)
memory usage: 55.1+ KB


In [7]:
patients[["weight", "height", "bmi"]].describe()

Unnamed: 0,weight,height,bmi
count,503.0,503.0,503.0
mean,173.43499,66.634195,27.483897
std,33.916741,4.411297,5.276438
min,48.8,27.0,17.1
25%,149.3,63.0,23.3
50%,175.3,67.0,27.2
75%,199.5,70.0,31.75
max,255.9,79.0,37.7


In [8]:
mask = patients["weight"] < 100
patients[mask]

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
210,211,female,Camilla,Zaitseva,4689 Briarhill Lane,Wooster,OH,44691.0,United States,330-202-2145CamillaZaitseva@superrito.com,11/26/1938,48.8,63,19.1


In [9]:
mask = patients["height"] < 50
patients[mask]

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
4,5,male,Tim,Neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,334-515-7487TimNeudorf@cuvox.de,2/18/1928,192.3,27,26.1


In [10]:
patients.duplicated().sum()

0

In [11]:
patients.duplicated(subset=["surname"]).sum()

37

In [12]:
mask = patients.duplicated(subset=["surname"])
patients[mask]

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
26,27,female,Ella,Lund,1207 Garfield Road,Peoria,IL,61602.0,United States,309-671-8852EllaLund@armyspy.com,12/19/1933,144.8,61,27.4
29,30,male,Jake,Jakobsen,648 Old Dear Lane,Port Jervis,New York,12771.0,United States,JakobCJakobsen@einrot.com+1 (845) 858-7707,8/1/1985,155.8,67,24.4
122,123,male,Bội,Tạ,2389 Rubaiyat Road,Grand Rapids,MI,49503.0,United States,TaHaBoi@superrito.com231-607-3625,5/30/1929,211.0,69,31.2
139,140,female,Novalie,Berg,1275 Goldie Lane,Cincinnati,OH,45202.0,United States,NovalieBerg@gustr.com+1 (513) 383-0516,4/22/1973,156.9,61,29.6
194,195,male,Urso,Aranda,1330 Lincoln Street,Hopewell Mercer,NJ,8525.0,United States,609-466-3275UrsoArandaSanchez@rhyta.com,7/3/1999,163.7,70,23.5
195,196,female,Lamara,Dratchev,3731 Swick Hill Street,New Orleans,LA,70113.0,United States,985-253-7891LamaraDratchev@teleworm.us,10/20/1954,181.3,67,28.4
203,204,female,Mùi,Lương,1778 Rodney Street,Harvester,MO,63301.0,United States,636-442-6946LuongHongMui@einrot.com,2/29/1956,192.7,60,37.6
205,206,male,Breno,Correia,3539 Bottom Lane,Tonawanda,New York,14150.0,United States,716-743-5884BrenoLimaCorreia@superrito.com,7/11/1974,180.6,67,28.3
207,208,female,Beatrycze,Woźniak,182 Cross Street,Saginaw,MI,48607.0,United States,989-936-4563BeatryczeWozniak@armyspy.com,3/14/1946,119.2,61,22.5
229,230,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4


In [14]:
patients.duplicated(subset=["contact"]).sum()

19

In [19]:
mask = patients.duplicated(subset=["contact"], keep=False)
patients.dropna(subset=["contact"])[mask]

  patients.dropna(subset=["contact"])[mask]


Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
24,25,male,Jakob,Jakobsen,648 Old Dear Lane,Port Jervis,New York,12771.0,United States,JakobCJakobsen@einrot.com+1 (845) 858-7707,8/1/1985,155.8,67,24.4
29,30,male,Jake,Jakobsen,648 Old Dear Lane,Port Jervis,New York,12771.0,United States,JakobCJakobsen@einrot.com+1 (845) 858-7707,8/1/1985,155.8,67,24.4
97,98,male,Patrick,Gersten,2778 North Avenue,Burr,NE,68324.0,United States,PatrickGersten@rhyta.com402-848-4923,5/3/1954,138.2,71,19.3
131,132,female,Sandra,Taylor,2476 Fulton Street,Rainelle,WV,25962.0,United States,304-438-2648SandraCTaylor@dayrep.com,10/23/1960,206.1,64,35.4
215,216,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
229,230,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
237,238,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
244,245,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
251,252,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
277,278,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4


In [20]:
patients["state"].value_counts()

California    36
TX            32
New York      25
CA            24
NY            22
MA            22
PA            18
GA            15
Illinois      14
OH            14
Florida       13
MI            13
OK            13
LA            13
NJ            12
VA            11
MS            10
WI            10
IL            10
IN             9
AL             9
MN             9
TN             9
FL             9
NC             8
KY             8
WA             8
MO             7
KS             6
NV             6
ID             6
SC             5
CT             5
IA             5
Nebraska       4
AR             4
CO             4
AZ             4
RI             4
ND             4
ME             4
OR             3
WV             3
DE             3
MD             3
SD             3
VT             2
DC             2
NE             2
MT             2
NM             1
AK             1
NH             1
WY             1
Name: state, dtype: int64

In [23]:
patients.isnull().sum()

patient_id       0
assigned_sex     0
given_name       0
surname          0
address         12
city            12
state           12
zip_code        12
country         12
contact         12
birthdate        0
weight           0
height           0
bmi              0
dtype: int64

In [24]:
treatments.isnull().sum()

given_name        0
surname           0
auralin           0
novodra           0
hba1c_start       0
hba1c_end         0
hba1c_change    137
dtype: int64

In [25]:
treatments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 350 entries, 0 to 69
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   given_name    350 non-null    object 
 1   surname       350 non-null    object 
 2   auralin       350 non-null    object 
 3   novodra       350 non-null    object 
 4   hba1c_start   350 non-null    float64
 5   hba1c_end     350 non-null    float64
 6   hba1c_change  213 non-null    float64
dtypes: float64(3), object(4)
memory usage: 21.9+ KB


In [26]:
treatments.head()

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
0,veronika,jindrová,41u - 48u,-,7.63,7.2,
1,elliot,richardson,-,40u - 45u,7.56,7.09,0.97
2,yukitaka,takenaka,-,39u - 36u,7.68,7.25,
3,skye,gormanston,33u - 36u,-,7.97,7.62,0.35
4,alissa,montez,-,33u - 29u,7.78,7.46,0.32


In [27]:
treatments.duplicated().sum()

1

In [28]:
treatments.duplicated(subset=["surname"]).sum()

17

In [32]:
mask = treatments.duplicated(subset=["surname"], keep=False)
treatments[mask].sort_values("surname")

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
19,noe,aranda,26u - 34u,-,7.51,7.17,0.34
132,urso,aranda,30u - 38u,-,7.6,7.16,
49,satsita,batukayev,-,42u - 42u,7.63,7.25,0.38
226,daud,batukayev,-,37u - 30u,7.98,7.51,0.97
199,novalie,berg,-,32u - 31u,7.85,7.49,0.36
24,isac,berg,31u - 41u,-,9.68,9.29,0.39
219,diệt,bùi,-,30u - 32u,9.11,8.76,0.35
184,chân,bùi,31u - 42u,-,7.53,7.18,0.35
13,bernarda,cindrić,40u - 49u,-,7.89,7.55,0.34
150,manuela,cindrić,55u - 66u,-,8.07,7.76,0.31


In [33]:
reactions.head()

Unnamed: 0,given_name,surname,adverse_reaction
0,berta,napolitani,injection site discomfort
1,lena,baer,hypoglycemia
2,joseph,day,hypoglycemia
3,flavia,fiorentino,cough
4,manouck,wubbels,throat irritation


In [34]:
reactions.isnull().sum()

given_name          0
surname             0
adverse_reaction    0
dtype: int64

In [35]:
reactions.duplicated().sum()

0

#### Assess:
   - `patients` table:
       - `patient_id` and `zip_code` should be str
       - `birthdate` is str
       - `weight` and `height` are in unfamiliar units
       - inaccurate value for `weight` for patient_id 211
       - inaccurate value for `height` for patient_id 5
       - `contact` columns has two data email and number
       - duplicated patient 'John Doe'
       - null values in columns `address`, `city`, `state`, `zip_code`, `country`, `contact`
       - duplication for patient_ids (25, 30), (98, 503), (132, 283)
       - in consistent names in `state` column
       - `given_name` and `surname` should be lower case
       
    
   - `treatments` table
       - column `hba1c_change` has missing values
       - columns `auralin` and `novodra` have two values start and end dose
       - patient 'joseph day' is duplicated
       - 
   - `reactions` table

## Clean

#### Define

#### Code

#### Test

## Visualization