In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv("./cleaned_dataset.csv").drop(columns=["Unnamed: 0"])
df.head(2)

Unnamed: 0,title,drug_id,status,points,vector,safety_met,efficacy_met,administration_therapeutic_route,administration_therapeutic_area,phases,primary_completion
0,Safety and Dose Escalation Study of AAV2-hCHM ...,SPK-7001,"Active, not recruiting",10.0,AAV2,Yes,No,Subretinal,Ophthalmology,Phase 1/2,2019
1,Trial of AAV5-hFIX in Severe or Moderately Sev...,AMT-061,"Active, not recruiting",10.0,AAV5,Yes,Yes,Intravenous,Hematology,Phase 1/2,2021


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 11 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   title                              60 non-null     object 
 1   drug_id                            60 non-null     object 
 2   status                             60 non-null     object 
 3   points                             60 non-null     float64
 4   vector                             60 non-null     object 
 5   safety_met                         60 non-null     object 
 6   efficacy_met                       60 non-null     object 
 7   administration_therapeutic_route   60 non-null     object 
 8   administration_therapeutic_area    60 non-null     object 
 9   phases                             60 non-null     object 
 10  primary_completion                 60 non-null     int64  
dtypes: float64(1), int64(1), object(9)
memory usage: 5.3+ KB


In [4]:
df.efficacy_met.unique()

array(['No', 'Yes'], dtype=object)

## Labling the data

In [5]:

encoder = LabelEncoder()

data_before = df.efficacy_met
data_to_label = df.efficacy_met

df.efficacy_met = encoder.fit_transform(data_to_label)
df.safety_met = encoder.fit_transform(data_to_label)


pd.DataFrame({
    "Label" : data_before,
    "Label When Encoded": df.efficacy_met
}).head(3)

Unnamed: 0,Label,Label When Encoded
0,No,0
1,Yes,1
2,Yes,1


In [6]:
df.columns

Index(['title', 'drug_id', 'status', 'points', 'vector', 'safety_met',
       'efficacy_met', 'administration_therapeutic_route ',
       'administration_therapeutic_area', 'phases', 'primary_completion'],
      dtype='object')

In [7]:
df = pd.get_dummies(df, columns=[
    "status", "administration_therapeutic_route ", "administration_therapeutic_area",
    "phases", "vector", "drug_id"
])

df.head(1)

Unnamed: 0,title,points,safety_met,efficacy_met,primary_completion,"status_Active, not recruiting",status_Completed,status_Recruiting,status_Terminated,status_Unknown status,...,drug_id_Valrox,drug_id_Zolgensma,drug_id_rAAV 2/2. hRPE65p. hRPE65,drug_id_rAAV1-CMV- hGAA,drug_id_rAAV1-PG9DP,drug_id_rAAV1.CMV.huFollistatin344,drug_id_rAAV2-CB- hRPE65,drug_id_rAAV2-CBSB- hRPE65,drug_id_rAAV2.REP1,drug_id_scAAV2- P1ND4v2
0,Safety and Dose Escalation Study of AAV2-hCHM ...,10.0,0,0,2019,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Splitting Data

In [8]:
X = df.drop(columns=["efficacy_met", "title"])
y = df.efficacy_met

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=45)

X.head(2)

Unnamed: 0,points,safety_met,primary_completion,"status_Active, not recruiting",status_Completed,status_Recruiting,status_Terminated,status_Unknown status,administration_therapeutic_route _Intraarticular,administration_therapeutic_route _Intracoronary,...,drug_id_Valrox,drug_id_Zolgensma,drug_id_rAAV 2/2. hRPE65p. hRPE65,drug_id_rAAV1-CMV- hGAA,drug_id_rAAV1-PG9DP,drug_id_rAAV1.CMV.huFollistatin344,drug_id_rAAV2-CB- hRPE65,drug_id_rAAV2-CBSB- hRPE65,drug_id_rAAV2.REP1,drug_id_scAAV2- P1ND4v2
0,10.0,0,2019,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10.0,1,2021,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
X.columns

Index(['points', 'safety_met', 'primary_completion',
       'status_Active, not recruiting', 'status_Completed',
       'status_Recruiting', 'status_Terminated', 'status_Unknown status',
       'administration_therapeutic_route _Intraarticular',
       'administration_therapeutic_route _Intracoronary',
       'administration_therapeutic_route _Intracranial',
       'administration_therapeutic_route _Intramuscular',
       'administration_therapeutic_route _Intranasal',
       'administration_therapeutic_route _Intraparotidal',
       'administration_therapeutic_route _Intrathecal',
       'administration_therapeutic_route _Intravenous',
       'administration_therapeutic_route _Intravitreal',
       'administration_therapeutic_route _Subfoveal',
       'administration_therapeutic_route _Subretinal',
       'administration_therapeutic_area_Autoimmune',
       'administration_therapeutic_area_Cardiovascular',
       'administration_therapeutic_area_Hematology',
       'administration_the

In [10]:
len(df.title.unique())

60

# Model

In [11]:
model = LogisticRegression()
model.fit(X_train, y_train)

"Score: " + str(model.score(X_test, y_test))


'Score: 1.0'

# Exporting Model


In [12]:
import joblib 

joblib.dump(model, "./models/logistic-reg.joblib")

['./models/logistic-reg.joblib']