# Data Cleaning
  - Drop rows where the interactions list is empty.
  - Extract drug names from the links and delete extra noise (links that have nothing to do with interactions), possibly with a regex.
  - Noise: links whose text before the `.html` is anything other than a drug name,
    e.g. 'sickness', 'pregnancy', 'symptoms', 'image', or disease names.

In [None]:
import pandas as pd
import os
import re

In [None]:
# Replace the raw column headers with names that say what each column holds:
# the link of the queried drug and the link(s) of the drugs it interacts with.
column_renames = {
    'drug name': 'query_drug_link',
    'list of drugs that interact': 'interaction_drug_link',
}
df.rename(columns=column_renames, inplace=True)
df

Unnamed: 0,query_drug_link,interaction_drug_link
0,https://www.drugs.com/cons/a-b-otic.html,
1,https://www.drugs.com/mtm/abacavir.html,"https://www.drugs.com/mtm/riociguat.html, http..."
2,https://www.drugs.com/mtm/abacavir-and-lamivud...,
3,https://www.drugs.com/pro/abacavir-and-lamivud...,
4,https://www.drugs.com/pro/abacavir-oral-soluti...,
...,...,...
89248,https://www.drugs.com/pro/zyvana.html,
89249,https://www.drugs.com/zyvox.html,
89250,https://www.drugs.com/cons/zyvox-linezolid-int...,
89251,https://www.drugs.com/cons/zyvox-linezolid-ora...,


In [None]:
def get_drug_names(row, pattern):
    """Return the drug name for a row, extracting it from the query link if needed.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'drug_names' and 'query_drug_link'.
        pattern: Regular expression whose first capture group is the drug
            name. It is applied with ``re.match``, so it must match from the
            start of the link.

    Returns:
        The name already stored under 'drug_names' if there is one; otherwise
        the first capture group matched in 'query_drug_link'; otherwise None
        (the link is missing / not a string, or the pattern does not match).
    """
    existing_name = row['drug_names']
    # Keep a name that is already present: the caller writes the return value
    # back into the column, so returning None here would erase it.
    if isinstance(existing_name, str) and existing_name:
        return existing_name

    url = row['query_drug_link']
    # Missing cells show up as None or float NaN; re.match raises TypeError
    # on anything that is not a string.
    if not isinstance(url, str):
        return None

    matched_result = re.match(pattern, url)
    # group(1) is the first captured group, i.e. the drug name.
    return matched_result.group(1) if matched_result else None

In [None]:
def get_drug_names_all(row, pattern):
    """Return every interacting drug name for a row, extracting from links if needed.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'interaction_drug_names' and 'interaction_drug_link'.
        pattern: Regular expression with one capture group for the drug name;
            it is applied with ``re.findall`` across the whole link string.

    Returns:
        The non-empty list already stored under 'interaction_drug_names' if
        there is one; otherwise the list of names captured from
        'interaction_drug_link'; otherwise None (the link cell is missing /
        not a string, or nothing matched).
    """
    existing_names = row['interaction_drug_names']
    # Keep names that are already present: the caller writes the return value
    # back into the column, so returning None here would erase them.
    if isinstance(existing_names, list) and existing_names:
        return existing_names

    url = row['interaction_drug_link']
    # Missing cells show up as None or float NaN; only strings can be searched.
    if not isinstance(url, str):
        return None

    found_names = re.findall(pattern, url)
    # An empty result is reported as None rather than [].
    return found_names or None

In [None]:
def incorrect_information_removal(row):
    """Drop extracted names that are not drugs (noise pages) from a row.

    A name is discarded when its lowercase form contains any of the noise
    substrings listed below (pregnancy, disease, image pages, and so on).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the key
            'interaction_drug_names' holding a list of names, or None / NaN
            when nothing was extracted.

    Returns:
        A new list with the noise entries removed; an empty list when the
        cell is missing or every entry is noise.
    """
    words_to_remove = ('pregnancy', 'disease', 'illness', 'sickness', 'person', 'image', 'diabetes', 'hyperglycemia', 'side-effects', "side effects")

    drug_names = row['interaction_drug_names']
    # A missing cell is None, or float NaN once pandas has handled the column;
    # neither can be iterated.
    if drug_names is None or isinstance(drug_names, float):
        return []

    return [
        drug
        for drug in drug_names
        if not any(word in drug.lower() for word in words_to_remove)
    ]

In [None]:
# drug_name = df['interaction_drug_names'][1]
# isinstance(drug_name, list)

In [None]:
# Captures the page name that precedes ".html" in a drugs.com link. The
# optional non-capturing group skips a single lowercase section prefix such
# as "mtm/", "pro/" or "cons/"; the name itself may contain letters, digits,
# hyphens and underscores.
pattern_4 = r'https?:\/\/www\.drugs\.com\/(?:[a-z]+\/)?([a-zA-Z0-9_-]+)\.html?'

# Start both target columns as None so the helpers extract a value for every row.
df['drug_names'] = None
df['interaction_drug_names'] = None

# Row-wise apply instead of an iterrows() loop with per-cell assignment:
# it avoids one indexed write per row and does not depend on the index
# labels being unique.
df['drug_names'] = df.apply(get_drug_names, axis=1, args=(pattern_4,))
df['interaction_drug_names'] = df.apply(get_drug_names_all, axis=1, args=(pattern_4,))

# Strip noise entries (non-drug pages) and turn missing cells into [].
df['interaction_drug_names'] = df.apply(incorrect_information_removal, axis=1)

df

Unnamed: 0,query_drug_link,interaction_drug_link,drug_names,interaction_drug_names
0,https://www.drugs.com/cons/a-b-otic.html,,a-b-otic,[]
1,https://www.drugs.com/mtm/abacavir.html,"https://www.drugs.com/mtm/riociguat.html, http...",abacavir,"[riociguat, methadone]"
2,https://www.drugs.com/mtm/abacavir-and-lamivud...,,abacavir-and-lamivudine,[]
3,https://www.drugs.com/pro/abacavir-and-lamivud...,,abacavir-and-lamivudine-tablets,[]
4,https://www.drugs.com/pro/abacavir-oral-soluti...,,abacavir-oral-solution,[]
...,...,...,...,...
89248,https://www.drugs.com/pro/zyvana.html,,zyvana,[]
89249,https://www.drugs.com/zyvox.html,,zyvox,[]
89250,https://www.drugs.com/cons/zyvox-linezolid-int...,,zyvox-linezolid-intravenous,[]
89251,https://www.drugs.com/cons/zyvox-linezolid-ora...,,zyvox-linezolid-oral,[]


In [None]:
# Destination folder and file for the cleaned table.
directory = '/content/drive/MyDrive/drug_interactions/database'
file_path = os.path.join(directory, 'preprocessed_drug_interactions_db.csv')

# Make sure the folder is there (no error if it already exists), then write
# the DataFrame, index included, as CSV.
os.makedirs(directory, exist_ok=True)
df.to_csv(file_path)

In [None]:
# Reload the saved table, using the first CSV column as the index.
# NOTE: CSV stores the list column as text, so 'interaction_drug_names' comes
# back as strings such as "['riociguat', 'methadone']" rather than lists;
# parse them (e.g. with ast.literal_eval) before treating them as lists.
df = pd.read_csv(
    '/content/drive/MyDrive/drug_interactions/database/preprocessed_drug_interactions_db.csv',
    index_col=0,
)
df

Unnamed: 0,query_drug_link,interaction_drug_link,drug_names,interaction_drug_names
0,https://www.drugs.com/cons/a-b-otic.html,,a-b-otic,[]
1,https://www.drugs.com/mtm/abacavir.html,"https://www.drugs.com/mtm/riociguat.html, http...",abacavir,"['riociguat', 'methadone']"
2,https://www.drugs.com/mtm/abacavir-and-lamivud...,,abacavir-and-lamivudine,[]
3,https://www.drugs.com/pro/abacavir-and-lamivud...,,abacavir-and-lamivudine-tablets,[]
4,https://www.drugs.com/pro/abacavir-oral-soluti...,,abacavir-oral-solution,[]
...,...,...,...,...
89248,https://www.drugs.com/pro/zyvana.html,,zyvana,[]
89249,https://www.drugs.com/zyvox.html,,zyvox,[]
89250,https://www.drugs.com/cons/zyvox-linezolid-int...,,zyvox-linezolid-intravenous,[]
89251,https://www.drugs.com/cons/zyvox-linezolid-ora...,,zyvox-linezolid-oral,[]
