## EDA and Data Cleaning


In [1]:
import pandas as pd

# Raw training split of the Kaggle "druggie" drug-review dataset:
# https://www.kaggle.com/datasets/rajgupta2019/druggie
train_csv_path = "data/train.csv"
df = pd.read_csv(train_csv_path)
# Print column names, dtypes and non-null counts as a quick schema check
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32165 entries, 0 to 32164
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   patient_id                  32165 non-null  int64  
 1   name_of_drug                32165 non-null  object 
 2   use_case_for_drug           32165 non-null  object 
 3   review_by_patient           32165 non-null  object 
 4   effectiveness_rating        32165 non-null  int64  
 5   drug_approved_by_UIC        32165 non-null  object 
 6   number_of_times_prescribed  32165 non-null  int64  
 7   base_score                  32165 non-null  float64
dtypes: float64(1), int64(3), object(4)
memory usage: 2.0+ MB


In [2]:
# Preview the first ten raw records to eyeball column contents
df.head(n=10)

Unnamed: 0,patient_id,name_of_drug,use_case_for_drug,review_by_patient,effectiveness_rating,drug_approved_by_UIC,number_of_times_prescribed,base_score
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,8.022969
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,7.858458
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,6.341969
3,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,6.590176
4,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,28-Nov-15,43,6.144782
5,48928,Ethinyl estradiol / levonorgestrel,Birth Control,"""I had been on the pill for many years. When m...",8,8-Dec-16,1,5.691792
6,75612,L-methylfolate,Depression,"""I have taken anti-depressants for years, with...",10,9-Mar-17,54,8.563274
7,219869,Amitriptyline,ibromyalgia,"""I&#039;ve been taking amitriptyline since Jan...",9,15-Mar-17,39,7.301039
8,212077,Lamotrigine,Bipolar Disorde,"""I&#039;ve been on every medicine under the su...",10,9-Nov-14,18,6.63397
9,12372,Atripla,HIV Infection,"""Spring of 2008 I was hospitalized with pnuemo...",8,9-Jul-10,11,6.562116


In [3]:
# Count null entries per column (isnull is pandas' alias of isna)
df.isnull().sum()

patient_id                    0
name_of_drug                  0
use_case_for_drug             0
review_by_patient             0
effectiveness_rating          0
drug_approved_by_UIC          0
number_of_times_prescribed    0
base_score                    0
dtype: int64

In [4]:
# Ratings strictly above this value are treated as a positive review
POSITIVE_RATING_THRESHOLD = 6

# Create text column: the condition being treated followed by the review body
df["text"] = df["use_case_for_drug"] + " " + df["review_by_patient"]
# Bin effectiveness rating (1 for positive, 0 for negative) using a vectorised
# comparison instead of a per-row Python lambda
df["label"] = (df["effectiveness_rating"] > POSITIVE_RATING_THRESHOLD).astype("int64")

# Keep only the two columns used downstream. The explicit .copy() makes the
# result an independent frame, so the column assignments performed on `df`
# in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[["text", "label"]].copy()

In [5]:
import html
import re

# At each position this matches, in order of preference: a parenthesised
# aside (no nested parentheses), any single character that is neither a word
# character nor a space, or any token containing at least one digit.
_NOISE_RE = re.compile(r"\([^\(\)]*\)|[^\w ]|\w*\d+\w*")
# Runs of one or more spaces, collapsed to a single space after noise removal
_SPACES_RE = re.compile(r" +")


def clean_text(s):
    """Normalise one raw review string for vectorisation.

    Steps, in order:
      1. Decode HTML entities (the raw reviews contain e.g. ``&#039;``).
         Without this step the regex below strips only the ``&`` and ``;``
         of a named entity and leaves its letters behind as a bogus token
         (``&amp;`` would become ``amp``).
      2. Remove numbers, special characters, and text in parentheses by
         replacing each match with a space.
      3. Collapse repeated spaces, trim both ends and convert to lower case.

    Parameters
    ----------
    s : str
        Raw text (use case + review).

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    s = html.unescape(s)
    s = _NOISE_RE.sub(" ", s)
    s = _SPACES_RE.sub(" ", s)
    return s.strip().lower()


# Apply the cleaning to every row in a single pass
df["text"] = df["text"].map(clean_text)

In [6]:
# Remove rows whose cleaned text repeats an earlier row; the first
# occurrence (and its label) is the one that is kept
df = df.drop_duplicates(subset="text", keep="first")

In [7]:
# Class balance check: the data is imbalanced, so oversampling will be
# applied later, once the text has been vectorised
label_counts = df["label"].value_counts()
label_counts

label
1    20012
0    10150
Name: count, dtype: int64

In [8]:
# Report how many rows survived cleaning, then preview the cleaned frame
print(f"No. of rows: {df.shape[0]}")
df.head(n=10)

No. of rows: 30162


Unnamed: 0,text,label
0,left ventricular dysfunction it has no side ef...,1
1,adhd my son is halfway through his fourth week...,1
2,birth control i used to take another oral cont...,0
3,opiate dependence suboxone has completely turn...,1
4,benign prostatic hyperplasia day on started to...,0
5,birth control i had been on the pill for many ...,1
6,depression i have taken anti depressants for y...,1
7,ibromyalgia i ve been taking amitriptyline sin...,1
8,bipolar disorde i ve been on every medicine un...,1
9,hiv infection spring of i was hospitalized wit...,1


In [9]:
# Persist the cleaned text/label pairs; the row index is not written
df.to_csv(path_or_buf="data/clean.csv", index=False)