In [131]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Train and evaluate a text classifier that maps a free-text prescription
# line to its rx_norm label: load the CSV, hold out a test split, turn the text
# into TF-IDF features, fit a random forest, and report held-out accuracy.

# Tunable values, gathered here so they are not buried in the calls below.
DATA_PATH = 'prescription.csv'
TEXT_COLUMN = 'unstructured_name'  # column vectorized as model input
LABEL_COLUMN = 'rx_norm'           # column used as the prediction target
TEST_SIZE = 0.2                    # fraction of rows held out for evaluation
SEED = 42                          # shared by the split and the forest
MAX_TFIDF_FEATURES = 1000          # vocabulary cap for TF-IDF; adjust as needed

df = pd.read_csv(DATA_PATH)

# Rows with a missing text or label cannot be used for training or scoring:
# TfidfVectorizer raises on NaN documents, and a NaN label is not a usable
# class. Drop them before splitting. When the CSV has no gaps this is a no-op,
# so the split below is the same as it would be on `df` itself.
df_model = df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])
n_dropped = len(df) - len(df_model)
if n_dropped:
    print(f"Dropped {n_dropped} row(s) with missing {TEXT_COLUMN!r} or {LABEL_COLUMN!r}")

# Split the data into training and testing sets
train_data, test_data = train_test_split(
    df_model, test_size=TEST_SIZE, random_state=SEED
)

# Extract TF-IDF features from the text column. The vocabulary is learned from
# the training rows only (fit_transform); the test rows are only transformed,
# so nothing from the held-out set influences the features.
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
train_features = tfidf_vectorizer.fit_transform(train_data[TEXT_COLUMN])
test_features = tfidf_vectorizer.transform(test_data[TEXT_COLUMN])

# Create and train a Random Forest classifier
clf = RandomForestClassifier(random_state=SEED)
clf.fit(train_features, train_data[LABEL_COLUMN])

# Make predictions on the held-out rows
predictions = clf.predict(test_features)

# Evaluate the model: share of held-out rows whose predicted label matches
accuracy = accuracy_score(test_data[LABEL_COLUMN], predictions)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.55


In [132]:
# Spot-check the fitted model on one hand-written prescription line.
example_text = "syrup ibuprofen 5mls tds x5days"
new_input = [example_text]  # transform() takes an iterable of documents

# Reuse the already-fitted vectorizer (transform only, no re-fit) so the
# feature columns match the ones the classifier was trained on.
new_input_features = tfidf_vectorizer.transform(new_input)
predicted_rx_norm = clf.predict(new_input_features)

# One input document gives one prediction; show that single label.
print(f"Predicted rx_norm: {predicted_rx_norm[0]}")

Predicted rx_norm: ibuprofen
