In [2]:
import pandas as pd
import datetime as dt
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("./Dataset.csv")
df.head()

Unnamed: 0,AAVDB,NCT Number,Title,Drug ID,Status,No of pts,Vector,Safety met,Efficacy met,Administration Therapeutic route,Administration Therapeutic Area,Phases,Primary Completion
0,1,NCT02341807,Safety and Dose Escalation Study of AAV2-hCHM ...,SPK-7001,"Active, not recruiting",10.0,AAV2,Yes,No,Subretinal,Ophthalmology,Phase 1/2,1/10/2019
1,3,NCT02396342,Trial of AAV5-hFIX in Severe or Moderately Sev...,AMT-061,"Active, not recruiting",10.0,AAV5,Yes,Yes,Intravenous,Hematology,Phase 1/2,1/5/2021
2,4,NCT01637805,Clinical Safety and Preliminary Efficacy of AA...,,Unknown status,20.0,,,,,Oncology,Phase 1,1/10/2016
3,6,NCT03496012,Efficacy and Safety of AAV2-REP1 for the Treat...,BIIB-111,Recruiting,14.0,AAV2,Yes,Yes,Subretinal,Ophthalmology,Phase 3,31/03/2020
4,7,NCT03252847,Gene Therapy for X-linked Retinitis Pigmentosa...,A-004,Recruiting,36.0,AAV2,,,,Ophthalmology,??,1/11/2020


In [3]:
df.columns

Index(['AAVDB', 'NCT Number', 'Title', 'Drug ID', 'Status', 'No of pts',
       'Vector', 'Safety met', 'Efficacy met',
       'Administration Therapeutic route ', 'Administration Therapeutic Area',
       'Phases', 'Primary Completion'],
      dtype='object')

Data Cleaning

In [4]:
data = df.drop(columns=['AAVDB', 'NCT Number'])
data

Unnamed: 0,Title,Drug ID,Status,No of pts,Vector,Safety met,Efficacy met,Administration Therapeutic route,Administration Therapeutic Area,Phases,Primary Completion
0,Safety and Dose Escalation Study of AAV2-hCHM ...,SPK-7001,"Active, not recruiting",10.0,AAV2,Yes,No,Subretinal,Ophthalmology,Phase 1/2,1/10/2019
1,Trial of AAV5-hFIX in Severe or Moderately Sev...,AMT-061,"Active, not recruiting",10.0,AAV5,Yes,Yes,Intravenous,Hematology,Phase 1/2,1/5/2021
2,Clinical Safety and Preliminary Efficacy of AA...,,Unknown status,20.0,,,,,Oncology,Phase 1,1/10/2016
3,Efficacy and Safety of AAV2-REP1 for the Treat...,BIIB-111,Recruiting,14.0,AAV2,Yes,Yes,Subretinal,Ophthalmology,Phase 3,31/03/2020
4,Gene Therapy for X-linked Retinitis Pigmentosa...,A-004,Recruiting,36.0,AAV2,,,,Ophthalmology,??,1/11/2020
...,...,...,...,...,...,...,...,...,...,...,...
147,Safety and Efficacy Study of rAAV.sFlt-1 in Pa...,AVA-101,Completed,40.0,AAV2,,,,Ophthalmology,Phase 1/2,1/5/2017
148,Gene Therapy Clinical Study in Adult PKU,HMI-102,Recruiting,21.0,AAVHSC15,,,Intravenous,Metabolic,Phase 1/2,1/6/2021
149,Dose-escalation Study to Evaluate the Safety a...,GS-030,Recruiting,18.0,AAV2/7m8,,,,Ophthalmology,Phase 1/2,1/9/2020
150,Phase 1 Dose Escalation Study of Intra-Articul...,tgAAC94,Completed,15.0,AAV2,Yes,No,,Autoimmune,Phase 1,1/11/2005


Removing Null Values

In [5]:
data["Efficacy met"].unique()

array(['No', 'Yes', nan, 'Yes but comparable to placebo in other studies',
       'Yes??', 'Preliminary', 'yes'], dtype=object)

In [6]:

dates = data["Primary Completion"]

for x in range(len(data)):
    date = dates[x]
    date = str(date).split("/")
    data["Primary Completion"][x] = date[2]

data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Primary Completion"][x] = date[2]


Unnamed: 0,Title,Drug ID,Status,No of pts,Vector,Safety met,Efficacy met,Administration Therapeutic route,Administration Therapeutic Area,Phases,Primary Completion
0,Safety and Dose Escalation Study of AAV2-hCHM ...,SPK-7001,"Active, not recruiting",10.0,AAV2,Yes,No,Subretinal,Ophthalmology,Phase 1/2,2019
1,Trial of AAV5-hFIX in Severe or Moderately Sev...,AMT-061,"Active, not recruiting",10.0,AAV5,Yes,Yes,Intravenous,Hematology,Phase 1/2,2021
2,Clinical Safety and Preliminary Efficacy of AA...,,Unknown status,20.0,,,,,Oncology,Phase 1,2016
3,Efficacy and Safety of AAV2-REP1 for the Treat...,BIIB-111,Recruiting,14.0,AAV2,Yes,Yes,Subretinal,Ophthalmology,Phase 3,2020
4,Gene Therapy for X-linked Retinitis Pigmentosa...,A-004,Recruiting,36.0,AAV2,,,,Ophthalmology,??,2020
...,...,...,...,...,...,...,...,...,...,...,...
147,Safety and Efficacy Study of rAAV.sFlt-1 in Pa...,AVA-101,Completed,40.0,AAV2,,,,Ophthalmology,Phase 1/2,2017
148,Gene Therapy Clinical Study in Adult PKU,HMI-102,Recruiting,21.0,AAVHSC15,,,Intravenous,Metabolic,Phase 1/2,2021
149,Dose-escalation Study to Evaluate the Safety a...,GS-030,Recruiting,18.0,AAV2/7m8,,,,Ophthalmology,Phase 1/2,2020
150,Phase 1 Dose Escalation Study of Intra-Articul...,tgAAC94,Completed,15.0,AAV2,Yes,No,,Autoimmune,Phase 1,2005


In [7]:
data.columns

Index(['Title', 'Drug ID', 'Status', 'No of pts', 'Vector', 'Safety met',
       'Efficacy met', 'Administration Therapeutic route ',
       'Administration Therapeutic Area', 'Phases', 'Primary Completion'],
      dtype='object')

In [8]:
df = data.dropna(subset=[
    "Efficacy met",
    "Vector",
    "Safety met",
    "Administration Therapeutic route ",
    "Administration Therapeutic Area",
    "Phases",
    "Drug ID"
])
df.head()

Unnamed: 0,Title,Drug ID,Status,No of pts,Vector,Safety met,Efficacy met,Administration Therapeutic route,Administration Therapeutic Area,Phases,Primary Completion
0,Safety and Dose Escalation Study of AAV2-hCHM ...,SPK-7001,"Active, not recruiting",10.0,AAV2,Yes,No,Subretinal,Ophthalmology,Phase 1/2,2019
1,Trial of AAV5-hFIX in Severe or Moderately Sev...,AMT-061,"Active, not recruiting",10.0,AAV5,Yes,Yes,Intravenous,Hematology,Phase 1/2,2021
3,Efficacy and Safety of AAV2-REP1 for the Treat...,BIIB-111,Recruiting,14.0,AAV2,Yes,Yes,Subretinal,Ophthalmology,Phase 3,2020
11,AAV2-GDNF for Advanced Parkinson?s Disease,AMT-090,"Active, not recruiting",13.0,AAV2,Yes,Yes,Intracranial,Neurology,Phase 1,2022
13,A Study of AAV-hAADC-2 in Subjects With Parkin...,VYAADC-01,Completed,10.0,AAV1,Yes,Yes but comparable to placebo in other studies,Intracranial,Neurology,Phase 1,2013


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 146
Data columns (total 11 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Title                              60 non-null     object 
 1   Drug ID                            60 non-null     object 
 2   Status                             60 non-null     object 
 3   No of pts                          60 non-null     float64
 4   Vector                             60 non-null     object 
 5   Safety met                         60 non-null     object 
 6   Efficacy met                       60 non-null     object 
 7   Administration Therapeutic route   60 non-null     object 
 8   Administration Therapeutic Area    60 non-null     object 
 9   Phases                             60 non-null     object 
 10  Primary Completion                 60 non-null     object 
dtypes: float64(1), object(10)
memory usage: 5.6+ KB


# Lable Encoding the Eficacy Met COlumn

In [15]:
y = data["Efficacy met"]
y.unique()

array(['No', 'Yes', nan, 'Yes but comparable to placebo in other studies',
       'Yes??', 'Preliminary', 'yes'], dtype=object)

In [None]:
# encoder = LabelEncoder()
# eff = encoder.fit_transform(data["Efficacy met"])
# eff

In [None]:
df.to_csv("cleaned_dataset.csv")