# Arulin Vs. Novadra: Clinical Trial Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Gather

In [21]:
patients = pd.read_csv("patients.csv")
treatments = pd.concat([pd.read_csv("treatments.csv"), pd.read_csv("treatments_cut.csv")]) 
reactions = pd.read_csv("adverse_reactions.csv")


## Assess

In [4]:
patients.sample(5)

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
187,188,male,Władysław,Wieczorek,2338 Virginia Street,Chicago,IL,60605.0,United States,773-607-2647WladyslawWieczorek@teleworm.us,1/15/1994,133.1,67,20.8
215,216,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
99,100,male,Bernd,Schneider,1262 Deans Lane,Westbury,New York,11590.0,United States,BerndSchneider@jourrapide.com914-830-3940,3/24/1993,212.5,71,29.6
170,171,male,Chikere,Achebe,321 Briercliff Road,Brooklyn,New York,11227.0,United States,718-628-9500ChikereAchebe@jourrapide.com,1/12/1939,215.4,67,33.7
189,190,female,Kisanet,Selassie,3227 Park Avenue,Sacramento,California,95817.0,United States,KisanetSelassie@gustr.com+1 (916) 453-3601,4/26/1956,177.1,62,32.4


In [5]:
patients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   patient_id    503 non-null    int64  
 1   assigned_sex  503 non-null    object 
 2   given_name    503 non-null    object 
 3   surname       503 non-null    object 
 4   address       491 non-null    object 
 5   city          491 non-null    object 
 6   state         491 non-null    object 
 7   zip_code      491 non-null    float64
 8   country       491 non-null    object 
 9   contact       491 non-null    object 
 10  birthdate     503 non-null    object 
 11  weight        503 non-null    float64
 12  height        503 non-null    int64  
 13  bmi           503 non-null    float64
dtypes: float64(3), int64(2), object(9)
memory usage: 55.1+ KB


In [8]:
patients[["assigned_sex", "given_name", "surname", "address"]].duplicated().sum()

5

In [9]:
(patients["given_name"] + patients["surname"]).value_counts()

JohnDoe                6
MichaelSmith           1
LindaLundy             1
EsperanzaLabrosse      1
AydenAllan             1
                      ..
FlaviaFiorentino       1
EmyMarkus              1
RegoloNucci            1
ChukwumogeOgochukwu    1
PlacidoUdinesi         1
Length: 498, dtype: int64

In [10]:
patients[patients.surname == "Doe"]

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
215,216,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
229,230,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
237,238,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
244,245,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
251,252,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
277,278,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4


In [11]:
patients["address"].value_counts()

123 Main Street           6
648 Old Dear Lane         2
2476 Fulton Street        2
2778 North Avenue         2
2549 Pearlman Avenue      1
                         ..
4912 Hart Country Lane    1
1717 Vineyard Drive       1
4649 Worley Avenue        1
1990 Spring Avenue        1
4640 Windy Ridge Road     1
Name: address, Length: 483, dtype: int64

In [12]:
addresses = ["648 Old Dear Lane", "2476 Fulton Street", "2778 North Avenue"]
patients[patients.address.isin(addresses)]



Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
24,25,male,Jakob,Jakobsen,648 Old Dear Lane,Port Jervis,New York,12771.0,United States,JakobCJakobsen@einrot.com+1 (845) 858-7707,8/1/1985,155.8,67,24.4
29,30,male,Jake,Jakobsen,648 Old Dear Lane,Port Jervis,New York,12771.0,United States,JakobCJakobsen@einrot.com+1 (845) 858-7707,8/1/1985,155.8,67,24.4
97,98,male,Patrick,Gersten,2778 North Avenue,Burr,NE,68324.0,United States,PatrickGersten@rhyta.com402-848-4923,5/3/1954,138.2,71,19.3
131,132,female,Sandra,Taylor,2476 Fulton Street,Rainelle,WV,25962.0,United States,304-438-2648SandraCTaylor@dayrep.com,10/23/1960,206.1,64,35.4
282,283,female,Sandy,Taylor,2476 Fulton Street,Rainelle,WV,25962.0,United States,304-438-2648SandraCTaylor@dayrep.com,10/23/1960,206.1,64,35.4
502,503,male,Pat,Gersten,2778 North Avenue,Burr,Nebraska,68324.0,United States,PatrickGersten@rhyta.com402-848-4923,5/3/1954,138.2,71,19.3


In [13]:
patients.describe()

Unnamed: 0,patient_id,zip_code,weight,height,bmi
count,503.0,491.0,503.0,503.0,503.0
mean,252.0,49084.118126,173.43499,66.634195,27.483897
std,145.347859,30265.807442,33.916741,4.411297,5.276438
min,1.0,1002.0,48.8,27.0,17.1
25%,126.5,21920.5,149.3,63.0,23.3
50%,252.0,48057.0,175.3,67.0,27.2
75%,377.5,75679.0,199.5,70.0,31.75
max,503.0,99701.0,255.9,79.0,37.7


In [15]:
mask = (patients.weight < 60)
patients[mask]

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
210,211,female,Camilla,Zaitseva,4689 Briarhill Lane,Wooster,OH,44691.0,United States,330-202-2145CamillaZaitseva@superrito.com,11/26/1938,48.8,63,19.1


In [16]:
mask = (patients.height < 48)
patients[mask]

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
4,5,male,Tim,Neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,334-515-7487TimNeudorf@cuvox.de,2/18/1928,192.3,27,26.1


In [17]:
mask = (patients.height > 96)
patients[mask]

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi


In [18]:
treatments.sample(5)

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
80,hideki,haraguchi,-,37u - 35u,7.59,7.05,0.54
69,ivan,fomin,25u - 32u,-,9.12,8.73,0.39
27,mizuki,iwata,-,45u - 46u,7.7,7.23,0.97
152,lewis,webb,39u - 44u,-,7.85,7.52,0.33
43,jens,poulsen,-,36u - 32u,7.55,7.16,0.39


In [22]:
treatments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 350 entries, 0 to 69
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   given_name    350 non-null    object 
 1   surname       350 non-null    object 
 2   auralin       350 non-null    object 
 3   novodra       350 non-null    object 
 4   hba1c_start   350 non-null    float64
 5   hba1c_end     350 non-null    float64
 6   hba1c_change  213 non-null    float64
dtypes: float64(3), object(4)
memory usage: 21.9+ KB


In [23]:
treatments.describe()


Unnamed: 0,hba1c_start,hba1c_end,hba1c_change
count,350.0,350.0,213.0
mean,7.956343,7.560057,0.540657
std,0.545328,0.545456,0.277417
min,7.5,7.01,0.2
25%,7.65,7.27,0.34
50%,7.785,7.4,0.38
75%,7.95,7.5575,0.92
max,9.95,9.58,0.99


#### `patients` table:
   - **state** column appears in full and short forms
   - missing data in columns **address**, **city**, **state**, **zip_code**, **country** and **contact**
   - **contact** column have both email and phone number
   - **zip_code** column have some values of 4 digits and appear as float
   - **birthdate** column is string not datetime
   - patient 'John Doe' is duplicated
   - duplicated patients found by duplicated addresses
   - **weight** is inaccurate for 'Camilla Zaitseva'
   - **height** is inaccurate for 'Tim Neudorf'
   - 
   
#### `treatments` table:
   - lower case names in columns **given_name** and **surname**
   - missing data is represented by '-'
   - columns **arulina** and **novadra** have extra letter 'u' and contain two pieces of information
   - column **hba1c_change** have missing and inaccurate data
   
#### `adverse_reactions` table:
   - names are in lower case
   - should be part of `treatments` table


## Clean

In [24]:
patients_clean = patients.copy()
treatments_clean = treatments.copy()
reactions_clean = reactions.copy()

#### Define
   - change given_name and surname to title case in tables `treatments` and `adverse_reactions`, then join the tables

#### Code

In [33]:
treatments_clean["given_name"] = treatments_clean["given_name"].str.title()
treatments_clean["surname"] = treatments_clean["surname"].str.title()

reactions_clean["given_name"] = reactions_clean["given_name"].str.title()
reactions_clean["surname"] = reactions_clean["surname"].str.title()

treatments_clean = treatments_clean.merge(reactions_clean, on=["given_name", "surname"], how="outer")

#### Test

In [36]:
treatments_clean

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change,adverse_reaction
0,Veronika,Jindrová,41u - 48u,-,7.63,7.20,,
1,Elliot,Richardson,-,40u - 45u,7.56,7.09,0.97,hypoglycemia
2,Yukitaka,Takenaka,-,39u - 36u,7.68,7.25,,
3,Skye,Gormanston,33u - 36u,-,7.97,7.62,0.35,
4,Alissa,Montez,-,33u - 29u,7.78,7.46,0.32,
...,...,...,...,...,...,...,...,...
345,Rovzan,Kishiev,32u - 37u,-,7.75,7.41,0.34,
346,Jakob,Jakobsen,-,28u - 26u,7.96,7.51,0.95,hypoglycemia
347,Bernd,Schneider,48u - 56u,-,7.74,7.44,0.30,
348,Berta,Napolitani,-,42u - 44u,7.68,7.21,,injection site discomfort


## Visualize

## Model


## Save