In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE


In [18]:
path = '/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/'
data = pd.read_csv(path + 'training_data_RAW.csv')

In [20]:

# Create a label encoder for all characters in the dataset
all_characters = set(''.join(data['Name']))
label_encoder = LabelEncoder()
label_encoder.fit(list(all_characters))

# Function to encode a word into a list of integers
def encode_word(word):
    return label_encoder.transform(list(word))

# Encode all words in the dataset
data['encoded'] = data['Name'].apply(lambda word: np.array(encode_word(word)))

# Pad encoded sequences with zeros to ensure uniform length
max_length = max(data['encoded'].apply(len))
data['padded'] = data['encoded'].apply(lambda x: np.concatenate([x, np.zeros(max_length - len(x))]))

# Prepare the dataset for training
X = np.stack(data['padded'].values)
y = data['label'].values

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Initialize and train the MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=500)
mlp.fit(X_train_smote, y_train_smote)

# Make predictions on the validation set
y_pred = mlp.predict(X_val)

# Calculate and print various performance metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
f1 = f1_score(y_val, y_pred, average='binary')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

# Function to make predictions on new words
def predict_new_word(model, new_word):
    encoded = np.array(encode_word(new_word))
    padded = np.concatenate([encoded, np.zeros(max_length - len(encoded))])
    prediction = model.predict([padded])
    return prediction[0]

# Example prediction
new_word = "Aspirin"
prediction = predict_new_word(mlp, new_word)
print(f'The word "{new_word}" is predicted to be a drug name with probability: {prediction}')

# Modified function to check if words are in the dataset before predicting
def predict_and_print(words, model, data):
    for word in words:
        # Check if the word is in the dataset
        if word.lower() in data['Name'].str.lower().values:
            print(f'The word "{word}" was found in the dataset and will be skipped.')
        else:
            prediction = predict_new_word(model, word)
            print(f'The word "{word}" is predicted to be a drug name with probability: {prediction}')


# Additional words for prediction
additional_words = [
    "Paracetamol",  # Common medication
    "Tromboner",    # Common medication
    "Rivaroxaban",    # Common medication for diabetes
    "Icatibant",  # Common medication for cholesterol
    "Oxurion",   # Common medication for high blood pressure
    "marta", # Antibiotic
    "pupka",# Common in medication names but not a medication itself
    "schladming",    # Not a medication
    "Sunshine",     # Not a medication
    "Technology"    # Not a medication
]

# Use the modified function with the additional_words list and the original dataset
predict_and_print(additional_words, mlp, data)

Accuracy: 0.7918807810894142
Precision: 0.29303845187232974
Recall: 0.7319522912743252
F1-score: 0.41852117731514715
The word "Aspirin" is predicted to be a drug name with probability: 0
The word "Paracetamol" is predicted to be a drug name with probability: 0
The word "Tromboner" is predicted to be a drug name with probability: 0
The word "Rivaroxaban" was found in the dataset and will be skipped.
The word "Icatibant" was found in the dataset and will be skipped.
The word "Oxurion" is predicted to be a drug name with probability: 0
The word "marta" is predicted to be a drug name with probability: 0
The word "pupka" is predicted to be a drug name with probability: 0
The word "schladming" is predicted to be a drug name with probability: 1
The word "Sunshine" was found in the dataset and will be skipped.
The word "Technology" was found in the dataset and will be skipped.


In [22]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

# Load the dataset
path = '/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/'
data = pd.read_csv(path + 'training_data_RAW.csv')

# Define a function to transform the dataset
def transform_dataset(X):
    return [' '.join(list(word)) for word in X]  # Separate characters by spaces

# Initialize a pipeline with a custom transformer and TfidfVectorizer
pipeline = make_pipeline(
    FunctionTransformer(transform_dataset, validate=False),
    TfidfVectorizer(analyzer='char', ngram_range=(1, 3)),
    SVC(kernel='linear', class_weight='balanced', probability=True)
)

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(data['Name'], data['label'], test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = pipeline.predict(X_val)

# Calculate and print various performance metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
f1 = f1_score(y_val, y_pred, average='binary')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

# Function to make predictions on new words
def predict_new_word(model, new_word):
    prediction = model.predict([new_word])[0]
    probability = model.predict_proba([new_word])[0][prediction]
    return prediction, probability

# Test the model with new words
test_words = ["Aspirin", "Paracetamol", "Tromboner", "Oxurion", "marta", "pupka", "schladming"]
for word in test_words:
    prediction, probability = predict_new_word(pipeline, word)
    print(f'The word "{word}" is predicted to be a drug name with probability: {probability:.4f}')
