In [2]:
! pip install nltk
import pandas as pd

import nltk
from nltk.corpus import wordnet as wn

# Download WordNet data (if you haven't already done so)
nltk.download('wordnet')

import re




[nltk_data] Downloading package wordnet to /Users/tomcio/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
df = pd.read_csv('/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/drugbank_vocabulary_RAW.csv')

# extract the drug names from the dataframe
drug_names = df[['Common name']]

# rename the column to 'Drug'
# before renaming, the dataframe has 16575 rows
drug_names = drug_names.rename(columns={'Common name': 'Name'})
drug_names = drug_names.sort_values(by='Name')
print(drug_names.size)

# save drug_names to a csv file
drug_names.to_csv('/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/drugbank_names_unfiltered.csv', index=False)

# Filter out rows where 'DrugName' contains any number or space
drug_names_filtered = drug_names[~drug_names['Name'].str.contains('\d|\s')].reset_index(drop=True)

# save drug_names_filtered to a csv file
drug_names_filtered.to_csv('/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/drugbank_names_filtered.csv', index=False)


# after cutting down names w numbers + multiple terms, the dataframe has 7099 rows
drug_names_filtered


16575


Unnamed: 0,Name
0,(+)-menthol
1,(-)-beta-Elemene
2,(Hydroxyethyloxy)Tri(Ethyloxy)Octane
3,(R)-Atenolol
4,(R)-Bicalutamide
...,...
7094,tgAAVCF
7095,α-Methylacetylfentanyl
7096,α-Methylfentanyl
7097,β-Hydroxythiofentanyl


In [4]:
# open international_drug_dictionary.xlsx in a pandas dataframe
df2 = pd.read_csv('/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/international_drug_dictionary_RAW.csv')

# Set the column names to the values of the first row
df2.columns = df2.iloc[0]
df2 = df2.drop(df2.index[0]) # drop the first row
df2 = df2.reset_index(drop=True) # reset index

# extract the drug names from the dataframe
drug_names2 = df2[['STR']]

# rename the column to 'Drug'
# before renaming, the dataframe has 451300 rows
drug_names2 = drug_names2.rename(columns={'STR': 'Name'})
drug_names2 = drug_names2.sort_values(by='Name')

# Remove duplicates, now it ha 11758 rows
drug_names2 = drug_names2.drop_duplicates()
drug_names2

# save drug_names to a csv file
drug_names2.to_csv('/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/international_drug_dictionary_unfiltered.csv', index=False)

# Filter out rows where 'DrugName' contains any number or space + reset index --> now it has 4673 rows
drug_names_filtered2 = drug_names2[~drug_names2['Name'].str.contains('\d|\s')].reset_index(drop=True)

# save drug_names_filtered to a csv file
drug_names_filtered2.to_csv('/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/international_drug_dictionary_filtered.csv', index=False)

# Display the DataFrame
drug_names_filtered2


  df2 = pd.read_csv('/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/international_drug_dictionary_RAW.csv')


Unnamed: 0,Name
0,(R)-Praziquantel
1,(S)-Propafenone
2,(S)-Warfarin
3,.gamma.-tocopherol
4,ACONITUM
...,...
4668,zonisamide
4669,zopiclone
4670,zorubicin
4671,zotepine


In [5]:
# Step 1: Concatenate the two DataFrames
combined_df = pd.concat([drug_names_filtered, drug_names_filtered2])

# Step 2: Remove duplicates to ensure all drug names are unique
unique_combined_df = combined_df.drop_duplicates(subset='Name')

# sort the dataframe by 'Name'
unique_combined_df = unique_combined_df.sort_values(by='Name')

# unique_combined_df to lowercase
unique_combined_df['Name'] = unique_combined_df['Name'].str.lower()

# Handle Missing Values
unique_combined_df.dropna(subset=['Name'], inplace=True)

# Remove Duplicates
unique_combined_df.drop_duplicates(subset='Name', inplace=True)

# reset index
unique_combined_df = unique_combined_df.reset_index(drop=True)

# save it to a csv file
unique_combined_df.to_csv('/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/FINAL_drug_names.csv', index=False)

unique_combined_df


Unnamed: 0,Name
0,(+)-menthol
1,(-)-beta-elemene
2,(hydroxyethyloxy)tri(ethyloxy)octane
3,(r)-atenolol
4,(r)-bicalutamide
...,...
8025,yttrium
8026,α-methylacetylfentanyl
8027,α-methylfentanyl
8028,β-hydroxythiofentanyl


### Get a dictionary from wordnet

In [6]:
# Extract words from WordNet
words = set()
for synset in wn.all_synsets():
    for lemma in synset.lemmas():
        word = lemma.name()
        # Filter words to include only alphabetic characters and avoid plurals heuristically
        if re.match(r'^[a-zA-Z]+$', word) and not word.endswith('s'):
            words.add(word)

# Convert the set of words to a list (to remove duplicates)
words_list = list(words)

# Create a single-column DataFrame from the list of words
df_words = pd.DataFrame(words_list, columns=['Word'])

# sort the dataframe by 'Word' alphabetically and reset index
df_words = df_words.sort_values(by='Word').reset_index(drop=True)

# save it to a csv file
df_words.to_csv('/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/wordnet_words.csv', index=False)

df_words

Unnamed: 0,Word
0,A
1,AA
2,AAA
3,AARP
4,AAS
...,...
70720,zymoid
70721,zymology
70722,zymolytic
70723,zymotic


### Merging dictionary with drug names

In [7]:
# load the data
path = '/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/'
drug_names = pd.read_csv(path + 'FINAL_drug_names.csv')
dictionary = pd.read_csv(path + 'wordnet_words.csv')

In [8]:
# rename Word to Name in dictionary
dictionary = dictionary.rename(columns={'Word': 'Name'})
drug_names['label'] = 1  # Label for drug names
dictionary['label'] = 0  # Label for non-drug names

In [9]:
combined_df = pd.concat([drug_names, dictionary], ignore_index=True)

# Step 2: Sort the DataFrame by 'name' and 'label' in descending order to ensure label 1 is on top
combined_df = combined_df.sort_values(by=['Name', 'label'], ascending=[True, False]).reset_index(drop=True)

# Step 3: Drop duplicates based on the 'name' column, keeping the first occurrence
combined_df = combined_df.drop_duplicates(subset='Name', keep='first')

# remove duplicates
combined_df = combined_df.dropna(subset=['Name'])

# save the dataframe
combined_df.to_csv(path + 'training_data_RAW.csv', index=False)

combined_df

Unnamed: 0,Name,label
0,(+)-menthol,1
1,(-)-beta-elemene,1
2,(hydroxyethyloxy)tri(ethyloxy)octane,1
3,(r)-acetoin,1
4,(r)-atenolol,1
...,...,...
78748,zytron,1
78749,α-methylacetylfentanyl,1
78750,α-methylfentanyl,1
78751,β-hydroxythiofentanyl,1


### Craeting another dataset without special symbols

In [11]:
combined_df_clean = combined_df

def clean_string(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Removes symbols

# Apply the function to the 'Name' column
combined_df_clean['Name'] = combined_df_clean['Name'].apply(clean_string)

# save the dataframe
combined_df_clean.to_csv(path + 'training_data_CLEAN.csv', index=False)

combined_df_clean



Unnamed: 0,Name,label
0,menthol,1
1,betaelemene,1
2,hydroxyethyloxytriethyloxyoctane,1
3,racetoin,1
4,ratenolol,1
...,...,...
78748,zytron,1
78749,methylacetylfentanyl,1
78750,methylfentanyl,1
78751,hydroxythiofentanyl,1
