In [83]:
! pip install nltk

import pandas as pd

import nltk
from nltk.corpus import wordnet as wn

# Download WordNet data (if you haven't already done so)
nltk.download('wordnet')

import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, f1_score





[nltk_data] Downloading package wordnet to /Users/tomcio/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [58]:
# load the data
path = '/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/'
drug_names = pd.read_csv(path + 'FINAL_drug_names.csv')
dictionary = pd.read_csv(path + 'wordnet_words.csv')

In [59]:
# rename Word to Name in dictionary
dictionary = dictionary.rename(columns={'Word': 'Name'})
drug_names['label'] = 1  # Label for drug names
dictionary['label'] = 0  # Label for non-drug names

In [86]:
combined_df = pd.concat([drug_names, dictionary], ignore_index=True)

# Step 2: Sort the DataFrame by 'name' and 'label' in descending order to ensure label 1 is on top
combined_df = combined_df.sort_values(by=['Name', 'label'], ascending=[True, False]).reset_index(drop=True)

# Step 3: Drop duplicates based on the 'name' column, keeping the first occurrence
combined_df = combined_df.drop_duplicates(subset='Name', keep='first')

# remove duplicates
combined_df = combined_df.dropna(subset=['Name'])

# save the dataframe
combined_df.to_csv(path + 'training_data_RAW.csv', index=False)

combined_df

Unnamed: 0,Name,label
0,(+)-menthol,1
1,(-)-beta-elemene,1
2,(hydroxyethyloxy)tri(ethyloxy)octane,1
3,(r)-acetoin,1
4,(r)-atenolol,1
...,...,...
78748,zytron,1
78749,α-methylacetylfentanyl,1
78750,α-methylfentanyl,1
78751,β-hydroxythiofentanyl,1


### Preprocessing

In [78]:
df = combined_df

#rename Name to processed_text
df = df.rename(columns={'Name': 'processed_text'})

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(df['processed_text'], df['label'], test_size=0.2, random_state=42)


### Embedding

In [84]:
combined_df

Unnamed: 0,Name,label
0,(+)-menthol,1
1,(-)-beta-elemene,1
2,(hydroxyethyloxy)tri(ethyloxy)octane,1
3,(r)-acetoin,1
4,(r)-atenolol,1
...,...,...
78748,zytron,1
78749,α-methylacetylfentanyl,1
78750,α-methylfentanyl,1
78751,β-hydroxythiofentanyl,1


In [88]:
# apply nltk.word_tokenize to Name column in combined_df
combined_df_tokenized = combined_df['Name'].apply(nltk.word_tokenize)
combined_df_tokenized


0                                      [(, +, ), -menthol]
1                                 [(, -, ), -beta-elemene]
2        [(, hydroxyethyloxy, ), tri, (, ethyloxy, ), o...
3                                      [(, r, ), -acetoin]
4                                     [(, r, ), -atenolol]
                               ...                        
78748                                             [zytron]
78749                             [α-methylacetylfentanyl]
78750                                   [α-methylfentanyl]
78751                              [β-hydroxythiofentanyl]
78752                                   [β-methylfentanyl]
Name: Name, Length: 77840, dtype: object

### Feature Engineering

In [45]:
# Basic feature extraction
combined_df['length'] = combined_df['Name'].apply(len)
vowels = set('aeiouAEIOU')
combined_df['vowel_count'] = combined_df['Name'].apply(lambda x: sum(1 for char in x if char in vowels))
combined_df['consonant_count'] = combined_df['length'] - combined_df['vowel_count']

# Define the function for counting numeric characters
def regex_numeric_count(name, pattern=r'[0-9αβγδ]'):
    return len(re.findall(pattern, name))

# Apply this function to the 'Name' column of your DataFrame
combined_df['numeric_count'] = combined_df['Name'].apply(regex_numeric_count)
combined_df

# You can add more sophisticated features here based on your analysis



Unnamed: 0,Name,label,length,vowel_count,consonant_count,numeric_count
0,(+)-menthol,1,11,2,9,0
1,(-)-beta-elemene,1,16,6,10,0
2,(hydroxyethyloxy)tri(ethyloxy)octane,1,36,9,27,0
3,(r)-acetoin,1,11,4,7,0
4,(r)-atenolol,1,12,4,8,0
...,...,...,...,...,...,...
78748,zytron,1,6,1,5,0
78749,α-methylacetylfentanyl,1,22,5,17,1
78750,α-methylfentanyl,1,16,3,13,1
78751,β-hydroxythiofentanyl,1,21,5,16,1


In [43]:
import re

def regex_numeric_count(name, pattern=r'[0-9αβγδ]'):
    return len(re.findall(pattern, name))

# Test the regex function
test_name = "α-methylacetylfentanyl"
test_count = regex_numeric_count(test_name)
print(f"Regex Numeric Count for '{test_name}': {test_count}")


Regex Numeric Count for 'α-methylacetylfentanyl': 1
