In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Location of the medicine dataset CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/Healthcare System + Secure/Models/AI driven Drug Suggestion/medicine_dataset.csv'

# Read the raw table; later cells derive `data_cleaned` from this frame.
data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Preview the first five raw rows (five is also the default for head()).
data.head(n=5)

Unnamed: 0,Name,Category,Dosage Form,Strength,Manufacturer,Indication,Classification
0,Acetocillin,Antidiabetic,Cream,938 mg,Roche Holding AG,Virus,Over-the-Counter
1,Ibuprocillin,Antiviral,Injection,337 mg,CSL Limited,Infection,Over-the-Counter
2,Dextrophen,Antibiotic,Ointment,333 mg,Johnson & Johnson,Wound,Prescription
3,Clarinazole,Antifungal,Syrup,362 mg,AbbVie Inc.,Pain,Prescription
4,Amoxicillin,Antifungal,Tablet,802 mg,Teva Pharmaceutical Industries Ltd.,Wound,Over-the-Counter


In [4]:
# Working copy of the data without the Manufacturer column; `data` itself is left untouched.
data_cleaned = data.drop('Manufacturer', axis='columns')

In [5]:
# Columns holding string labels. Each one gets its own fitted encoder, stored by
# column name so the same mapping can be reused when encoding prediction inputs.
categorical_columns = ['Category', 'Dosage Form', 'Indication', 'Classification', 'Name']
label_encoders = {}

# Always encode from the raw `data` frame (never from `data_cleaned` itself) so this
# cell is idempotent: re-running it refits the encoders on the original strings
# instead of on the integer codes written by a previous run.
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data_cleaned[col] = label_encoders[col].fit_transform(data[col])

# Remove the " mg" unit from the Strength column and convert it to float.
# Reading from `data` keeps the cell re-runnable (the cleaned column is already float),
# and regex=False makes the replacement a literal match on every pandas version.
data_cleaned['Strength'] = data['Strength'].str.replace(' mg', '', regex=False).astype(float)

data_cleaned.head()

Unnamed: 0,Name,Category,Dosage Form,Strength,Indication,Classification
0,0,3,1,938.0,6,0
1,48,7,4,337.0,4,0
2,36,1,5,333.0,7,1
3,27,4,6,362.0,5,1
4,8,4,7,802.0,7,0


In [6]:
# Preview the last five encoded rows (five is also the default for tail()).
data_cleaned.tail(n=5)

Unnamed: 0,Name,Category,Dosage Form,Strength,Indication,Classification
49995,35,0,5,405.0,5,0
49996,52,6,7,633.0,5,1
49997,39,1,4,327.0,2,0
49998,38,1,3,885.0,7,0
49999,41,5,5,426.0,2,0


In [7]:
# Target: the numeric Strength column. Features: every other column of the cleaned frame.
y = data_cleaned['Strength']
X = data_cleaned.drop('Strength', axis='columns')


In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Perform Linear Regression

In [9]:
from sklearn.metrics import mean_squared_error, r2_score

# Fit an ordinary least-squares baseline on the training split.
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows and for the rows the model was fitted on.
y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

# Note the two metrics use different splits: MSE is measured on the test set,
# while R-squared is measured on the training set.
mse = mean_squared_error(y_test, y_pred)
train_r2_score = r2_score(y_train, y_train_pred)

print(f'Mean Squared Error: {mse}')
print(f'Training R-squared (Accuracy) Score: {train_r2_score}')

# Fitted parameters: one coefficient per feature column, plus the intercept.
print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)


Mean Squared Error: 85154.55211864052
Training R-squared (Accuracy) Score: 3.432419541660181e-05
Coefficients: [-0.05211037 -0.4626975  -0.08492209 -0.37580764 -0.08755749]
Intercept: 503.27025387797255


# Performing All Regression Types Side by Side

In [10]:

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor

# Re-create the train/test split with the same arguments (test_size=0.2, random_state=42)
# used for the linear-regression baseline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of models to apply
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "ElasticNet Regression": ElasticNet(),
    # The tree shuffles candidate features internally, so without a fixed random_state
    # its fit (and therefore its reported metrics) can change from run to run.
    "Decision Tree Regression": DecisionTreeRegressor(random_state=42)
}

# Function to train a model and compute its metrics
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split and score it.

    Returns a tuple ``(train_r2, test_mse)``: R-squared measured on the
    training rows and mean squared error measured on the test rows.
    """
    model.fit(X_train, y_train)

    # R-squared on the rows the model was fitted on.
    r2_on_train = r2_score(y_train, model.predict(X_train))

    # Mean squared error on the held-out rows.
    mse_on_test = mean_squared_error(y_test, model.predict(X_test))

    return r2_on_train, mse_on_test

# Fit and score every candidate in turn, printing one summary line per model
for model_name in models:
    model = models[model_name]
    train_r2, test_mse = train_and_evaluate(
        model, X_train, X_test, y_train, y_test
    )
    print(f"{model_name} - Train R-squared (Accuracy): {train_r2:.4f}, Test MSE: {test_mse:.4f}")

Linear Regression - Train R-squared (Accuracy): 0.0000, Test MSE: 85154.5521
Ridge Regression - Train R-squared (Accuracy): 0.0000, Test MSE: 85154.5521
Lasso Regression - Train R-squared (Accuracy): 0.0000, Test MSE: 85152.7443
ElasticNet Regression - Train R-squared (Accuracy): 0.0000, Test MSE: 85152.7126
Decision Tree Regression - Train R-squared (Accuracy): 0.7462, Test MSE: 162804.5565


In [11]:
# Original (un-encoded) drug names, taken from the raw frame so that the printed
# results show readable names instead of integer codes.
drug_names = data['Name']

# Split the names together with X and y so that each test row keeps its drug name.
X_train, X_test, y_train, y_test, drug_names_train, drug_names_test = train_test_split(
    X, y, drug_names, test_size=0.2, random_state=42)

# List of models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "ElasticNet Regression": ElasticNet(),
    # The tree shuffles candidate features internally, so without a fixed random_state
    # its fit (and therefore its reported metrics) can change from run to run.
    "Decision Tree Regression": DecisionTreeRegressor(random_state=42)
}

# Training
def train_and_evaluate(model, X_train, X_test, y_train, y_test, drug_names_test):
    """Fit `model`, print a preview of its test predictions, and score it.

    The preview table pairs each test row's drug name with its actual and
    predicted strength; only the first rows are printed.

    Returns a tuple ``(train_r2, test_mse)``: R-squared measured on the
    training rows and mean squared error measured on the test rows.
    """
    model.fit(X_train, y_train)

    # Test-set predictions are reused for both the preview table and the MSE.
    test_predictions = model.predict(X_test)
    r2_on_train = r2_score(y_train, model.predict(X_train))
    mse_on_test = mean_squared_error(y_test, test_predictions)

    preview = pd.DataFrame({
        'Drug Name': drug_names_test,
        'Actual Strength': y_test,
        'Predicted Strength': test_predictions,
    })
    print(preview.head())

    return r2_on_train, mse_on_test

# For every candidate: print a header, a preview of its predictions, then its summary line
for model_name in models:
    model = models[model_name]
    print(f"\n{model_name} Results:")
    train_r2, test_mse = train_and_evaluate(
        model, X_train, X_test, y_train, y_test, drug_names_test
    )
    print(f"{model_name} - Train R-squared (Accuracy): {train_r2:.4f}, Test MSE: {test_mse:.4f}")


Linear Regression Results:
          Drug Name  Actual Strength  Predicted Strength
33553   Acetostatin            184.0          501.199595
9427    Acetostatin            810.0          498.883472
199      Acetomycin            176.0          496.614485
12447    Dolocillin            539.0          497.236696
39489  Ibuprocillin            369.0          499.553976
Linear Regression - Train R-squared (Accuracy): 0.0000, Test MSE: 85154.5521

Ridge Regression Results:
          Drug Name  Actual Strength  Predicted Strength
33553   Acetostatin            184.0          501.199583
9427    Acetostatin            810.0          498.883480
199      Acetomycin            176.0          496.614505
12447    Dolocillin            539.0          497.236703
39489  Ibuprocillin            369.0          499.553970
Ridge Regression - Train R-squared (Accuracy): 0.0000, Test MSE: 85154.5521

Lasso Regression Results:
          Drug Name  Actual Strength  Predicted Strength
33553   Acetostatin     

# Decision Tree Regression (Model with Highest Training R-squared, but also the Highest Test MSE)

In [18]:
import joblib
# Fit the decision tree that gets persisted below. random_state pins the tree's
# internal feature shuffling so the saved model and the printed metrics are the
# same on every run.
decision_tree_model = DecisionTreeRegressor(random_state=42)
decision_tree_model.fit(X_train, y_train)

# Make predictions on the test set
y_test_pred = decision_tree_model.predict(X_test)

# Calculate metrics (R-squared on the training rows, MSE on the test rows)
train_r2_score = r2_score(y_train, decision_tree_model.predict(X_train))
test_mse = mean_squared_error(y_test, y_test_pred)

# Display the index, drug name, and predictions.
# `drug_names_test` comes from the earlier split that carried the raw drug names along.
results = pd.DataFrame({
    'Drug Name': drug_names_test,
    'Actual Strength': y_test,
    'Predicted Strength': y_test_pred
})
print(results.head())  # Show the first few predictions

# Log the metrics
print(f'Decision Tree Regression - Train R-squared (Accuracy): {train_r2_score:.4f}')
print(f'Decision Tree Regression - Test MSE: {test_mse:.4f}')

# Persist the fitted model together with the encoders needed to prepare its inputs.
joblib.dump(decision_tree_model, "drug_strength_model_dt.joblib")
joblib.dump(label_encoders, 'label_encoders.joblib')

          Drug Name  Actual Strength  Predicted Strength
33553   Acetostatin            184.0               742.0
9427    Acetostatin            810.0               331.0
199      Acetomycin            176.0               264.0
12447    Dolocillin            539.0               138.0
39489  Ibuprocillin            369.0               432.0
Decision Tree Regression - Train R-squared (Accuracy): 0.7462
Decision Tree Regression - Test MSE: 162531.2329


['label_encoders.joblib']

# Hyperparameter Tuning

In [13]:
# from sklearn.model_selection import GridSearchCV

# # Define the hyperparameters to tune
# param_grid = {
#     'max_depth': [5, 10, 15, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 'log2', None],  # 'auto' dropped: removed in scikit-learn 1.3 (it meant all features, same as None)
#     'max_leaf_nodes': [None, 10, 20, 30]
# }

# # Initialize the DecisionTreeRegressor
# decision_tree_model = DecisionTreeRegressor()

# # Use GridSearchCV to search for the best hyperparameters
# grid_search = GridSearchCV(estimator=decision_tree_model, param_grid=param_grid, cv=5, scoring='r2', verbose=1)

# # Fit the model on the training data
# grid_search.fit(X_train, y_train)

# # Get the best model based on the grid search
# best_decision_tree_model = grid_search.best_estimator_

# # Print the best hyperparameters
# print(f"Best hyperparameters: {grid_search.best_params_}")

# # Make predictions using the best model
# y_test_pred = best_decision_tree_model.predict(X_test)

# # Calculate metrics
# train_r2_score = r2_score(y_train, best_decision_tree_model.predict(X_train))
# test_mse = mean_squared_error(y_test, y_test_pred)

# # Display the index, drug name, and predictions
# results = pd.DataFrame({
#     'Drug Name': drug_names_test,
#     'Actual Strength': y_test,
#     'Predicted Strength': y_test_pred
# })
# print(results.head())  # Show the first few predictions



# # Export the fine-tuned model
# joblib.dump(best_decision_tree_model, "drug_strength_model_dt_tuned.joblib")
# print("Fine-tuned Decision Tree model saved as 'drug_strength_model_dt_tuned.joblib'")


In [14]:
# Log the metrics
# print(f'Fine-Tuned Decision Tree Regression - Train R-squared (Accuracy): {train_r2_score:.4f}')
# print(f'Fine-Tuned Decision Tree Regression - Test MSE: {test_mse:.4f}')

# Testing

In [17]:
def predict_drug_strength(drug_name, category, dosage_form, indication, classification):
    """Predict a drug's strength (mg) from its raw, un-encoded attributes.

    Each argument is the original string label for one feature. It is encoded
    with the matching fitted encoder from the module-level ``label_encoders``
    and the encoded row is passed to the module-level ``decision_tree_model``.

    Returns:
        The model's predicted strength for the single input row.

    Raises:
        ValueError: if a value was not present when its encoder was fitted.
    """
    # Key order matters: it becomes the DataFrame column order, which must match the
    # feature order used for training (Name, Category, Dosage Form, Indication, Classification).
    input_data = {
        'Name': drug_name,
        'Category': category,
        'Dosage Form': dosage_form,
        'Indication': indication,
        'Classification': classification
    }

    # Encode the input data using the label encoders
    for col in input_data:
        if col in label_encoders:
            raw_value = input_data[col]
            try:
                input_data[col] = label_encoders[col].transform([raw_value])[0]
            except ValueError as err:
                # The encoder's own error does not say which feature was at fault;
                # re-raise with the column and value so the caller can fix the input.
                raise ValueError(
                    f"Unknown {col} value {raw_value!r}: it was not present in the training data."
                ) from err

    # Convert the input into a single-row DataFrame to match the model's expected input format
    input_df = pd.DataFrame([input_data])

    # Predict the strength using the in-memory Decision Tree model
    predicted_strength = decision_tree_model.predict(input_df)[0]

    return predicted_strength

# Usage: query the fitted tree for one example drug described by its raw labels
predicted_strength = predict_drug_strength(
    drug_name='Ibuprocillin', category='Analgesic', dosage_form='Tablet',
    indication='Pain', classification='Prescription',
)

print(f"Predicted Strength (mg): {predicted_strength:.2f}")


Predicted Strength (mg): 823.00
