In [57]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [58]:
df = pd.read_csv("./cleaned_dataset.csv").drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,title,drug_id,status,points,vector,safety_met,efficacy_met,administration_therapeutic_route,administration_therapeutic_area,phases,primary_completion
0,Safety and Dose Escalation Study of AAV2-hCHM ...,SPK-7001,"Active, not recruiting",10.0,AAV2,Yes,No,Subretinal,Ophthalmology,Phase 1/2,2019
1,Trial of AAV5-hFIX in Severe or Moderately Sev...,AMT-061,"Active, not recruiting",10.0,AAV5,Yes,Yes,Intravenous,Hematology,Phase 1/2,2021
2,Efficacy and Safety of AAV2-REP1 for the Treat...,BIIB-111,Recruiting,14.0,AAV2,Yes,Yes,Subretinal,Ophthalmology,Phase 3,2020
3,AAV2-GDNF for Advanced Parkinson?s Disease,AMT-090,"Active, not recruiting",13.0,AAV2,Yes,Yes,Intracranial,Neurology,Phase 1,2022
4,A Study of AAV-hAADC-2 in Subjects With Parkin...,VYAADC-01,Completed,10.0,AAV1,Yes,Yes,Intracranial,Neurology,Phase 1,2013


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 11 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   title                              60 non-null     object 
 1   drug_id                            60 non-null     object 
 2   status                             60 non-null     object 
 3   points                             60 non-null     float64
 4   vector                             60 non-null     object 
 5   safety_met                         60 non-null     object 
 6   efficacy_met                       60 non-null     object 
 7   administration_therapeutic_route   60 non-null     object 
 8   administration_therapeutic_area    60 non-null     object 
 9   phases                             60 non-null     object 
 10  primary_completion                 60 non-null     int64  
dtypes: float64(1), int64(1), object(9)
memory usage: 5.3+ KB


In [60]:
df.efficacy_met.unique()

array(['No', 'Yes'], dtype=object)

## Labling the data

In [61]:
encoder = LabelEncoder()

data_before = df.efficacy_met
data_to_label = df.efficacy_met

df.efficacy_met = encoder.fit_transform(data_to_label)
df.safety_met = encoder.fit_transform(data_to_label)


pd.DataFrame({
    "Label" : data_before,
    "Label When Encoded": df.efficacy_met
}).head(3)

Unnamed: 0,Label,Label When Encoded
0,No,0
1,Yes,1
2,Yes,1


In [62]:
df.columns

Index(['title', 'drug_id', 'status', 'points', 'vector', 'safety_met',
       'efficacy_met', 'administration_therapeutic_route ',
       'administration_therapeutic_area', 'phases', 'primary_completion'],
      dtype='object')

In [63]:
df = pd.get_dummies(df, columns=["status", "administration_therapeutic_route ", "administration_therapeutic_area", "phases", "vector", "drug_id"])
df

Unnamed: 0,title,points,safety_met,efficacy_met,primary_completion,"status_Active, not recruiting",status_Completed,status_Recruiting,status_Terminated,status_Unknown status,...,drug_id_Valrox,drug_id_Zolgensma,drug_id_rAAV 2/2. hRPE65p. hRPE65,drug_id_rAAV1-CMV- hGAA,drug_id_rAAV1-PG9DP,drug_id_rAAV1.CMV.huFollistatin344,drug_id_rAAV2-CB- hRPE65,drug_id_rAAV2-CBSB- hRPE65,drug_id_rAAV2.REP1,drug_id_scAAV2- P1ND4v2
0,Safety and Dose Escalation Study of AAV2-hCHM ...,10.0,0,0,2019,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Trial of AAV5-hFIX in Severe or Moderately Sev...,10.0,1,1,2021,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Efficacy and Safety of AAV2-REP1 for the Treat...,14.0,1,1,2020,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AAV2-GDNF for Advanced Parkinson?s Disease,13.0,1,1,2022,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A Study of AAV-hAADC-2 in Subjects With Parkin...,10.0,1,1,2013,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Dose Confirmation Trial of AAV5-hFIXco-Padua,3.0,1,1,2018,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Phase 1 Follow-on Study of AAV2-hRPE65v2 Vecto...,11.0,1,1,2026,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Hemophilia B Gene Therapy With AAV8 Vector,4.0,1,1,2016,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,Safety and Efficacy Study in Subjects With Leb...,31.0,1,1,2015,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Clinical Trial of Gene Therapy for the Treatme...,15.0,1,1,2018,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Spliting Data

In [64]:
X = df.drop(columns=["efficacy_met", "title"])
y = df.efficacy_met

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=45)

X.head(2)

Unnamed: 0,points,safety_met,primary_completion,"status_Active, not recruiting",status_Completed,status_Recruiting,status_Terminated,status_Unknown status,administration_therapeutic_route _Intraarticular,administration_therapeutic_route _Intracoronary,...,drug_id_Valrox,drug_id_Zolgensma,drug_id_rAAV 2/2. hRPE65p. hRPE65,drug_id_rAAV1-CMV- hGAA,drug_id_rAAV1-PG9DP,drug_id_rAAV1.CMV.huFollistatin344,drug_id_rAAV2-CB- hRPE65,drug_id_rAAV2-CBSB- hRPE65,drug_id_rAAV2.REP1,drug_id_scAAV2- P1ND4v2
0,10.0,0,2019,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10.0,1,2021,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
len(df.title.unique())

60

In [80]:
model = LogisticRegression()
model.fit(X_train, y_train)

"Score: " + "%" + str(model.score(X_test, y_test) * 100)


'Score: %100.0'