In [200]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, RobustScaler, PowerTransformer, MinMaxScaler
import re
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

In [201]:
df = pd.read_csv('drugsdata.csv')
df.head()

Unnamed: 0,drug_name,medical_condition,side_effects,generic_name,drug_classes,brand_names,activity,rx_otc,pregnancy_category,csa,alcohol,related_drugs,medical_condition_description,rating,no_of_reviews,drug_link,medical_condition_url
0,doxycycline,Acne,"(hives, difficult breathing, swelling in your ...",doxycycline,"Miscellaneous antimalarials, Tetracyclines","Acticlate, Adoxa CK, Adoxa Pak, Adoxa TT, Alod...",87%,Rx,D,N,X,amoxicillin: https://www.drugs.com/amoxicillin...,Acne Other names: Acne Vulgaris; Blackheads; B...,6.8,760.0,https://www.drugs.com/doxycycline.html,https://www.drugs.com/condition/acne.html
1,spironolactone,Acne,hives ; difficulty breathing; swelling of your...,spironolactone,"Aldosterone receptor antagonists, Potassium-sp...","Aldactone, CaroSpir",82%,Rx,C,N,X,amlodipine: https://www.drugs.com/amlodipine.h...,Acne Other names: Acne Vulgaris; Blackheads; B...,7.2,449.0,https://www.drugs.com/spironolactone.html,https://www.drugs.com/condition/acne.html
2,minocycline,Acne,"skin rash, fever, swollen glands, flu-like sym...",minocycline,Tetracyclines,"Dynacin, Minocin, Minolira, Solodyn, Ximino, V...",48%,Rx,D,N,,amoxicillin: https://www.drugs.com/amoxicillin...,Acne Other names: Acne Vulgaris; Blackheads; B...,5.7,482.0,https://www.drugs.com/minocycline.html,https://www.drugs.com/condition/acne.html
3,Accutane,Acne,problems with your vision or hearing; muscle o...,isotretinoin (oral),"Miscellaneous antineoplastics, Miscellaneous u...",,41%,Rx,X,N,X,doxycycline: https://www.drugs.com/doxycycline...,Acne Other names: Acne Vulgaris; Blackheads; B...,7.9,623.0,https://www.drugs.com/accutane.html,https://www.drugs.com/condition/acne.html
4,clindamycin,Acne,hives ; difficult breathing; swelling of your ...,clindamycin topical,"Topical acne agents, Vaginal anti-infectives","Cleocin T, Clindacin ETZ, Clindacin P, Clindag...",39%,Rx,B,N,,doxycycline: https://www.drugs.com/doxycycline...,Acne Other names: Acne Vulgaris; Blackheads; B...,7.4,146.0,https://www.drugs.com/mtm/clindamycin-topical....,https://www.drugs.com/condition/acne.html


In [202]:
df = df.drop(columns=['related_drugs', 'drug_link', 'medical_condition_url', 'brand_names', 'medical_condition_description', 'generic_name'])

In [203]:
df['activity'] = df['activity'].str.rstrip('%').astype(float) / 100

In [204]:
df['no_of_reviews'] = df['no_of_reviews'].fillna(0)
df['rating'] = df['rating'].fillna(df['rating'].mean())
df['alcohol'] = df['alcohol'].fillna('Unknown')

In [205]:
def count_side_effects(side_effects_str):
    if pd.isna(side_effects_str) or side_effects_str == '':
        return 0
    effects = re.split(r'[;,.\\n]', str(side_effects_str))
    effects = [effect.strip().lower() for effect in effects if effect.strip()]
    return len(effects)

In [206]:
def count_drug_classes(classes_str):
    if pd.isna(classes_str) or classes_str == '':
        return 0
    classes = [c.strip() for c in classes_str.split(',') if c.strip()]
    return len(classes)

In [207]:
df['num_side_effects'] = df['side_effects'].apply(count_side_effects)
df['num_drug_classes'] = df['drug_classes'].apply(count_drug_classes)

In [208]:
df = df.drop(columns=['side_effects', 'drug_classes'])

In [209]:
categorical_features = ['rx_otc', 'pregnancy_category', 'csa', 'alcohol']
le_dict = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    le_dict[col] = le

In [210]:
le_condition = LabelEncoder()
df['encoded_medical_condition'] = le_condition.fit_transform(df['medical_condition'].astype(str))


In [211]:
numerical_features = ['activity', 'no_of_reviews', 'num_side_effects', 'num_drug_classes']

numeric_transformer = Pipeline(steps=[
    ('power', PowerTransformer(method='yeo-johnson')), 
    ('scaler', RobustScaler()) 
])

In [212]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features)
    ], remainder='passthrough'  # Keep encoded categoricals as is
)

# Prepare features and target
all_features = numerical_features + categorical_features + ['encoded_medical_condition']
X = df[all_features]
y = df['rating']

In [213]:
processed_X = preprocessor.fit_transform(X)
processed_columns = numerical_features + categorical_features + ['encoded_medical_condition']
final_features_processed_df = pd.DataFrame(processed_X, columns=processed_columns, index=df.index)
final_features_processed_df['drug_name'] = df['drug_name']
final_features_processed_df['medical_condition'] = df['medical_condition']
final_features_processed_df['rating'] = y

In [214]:
scaler = MinMaxScaler()
final_features_processed_df[numerical_features] = scaler.fit_transform(final_features_processed_df[numerical_features])

In [215]:
# Normalize categorical features to [0, 1] to ensure scores stay in [0, 1]
categorical_to_scale = categorical_features + ['encoded_medical_condition']
scaler_cat = MinMaxScaler()
final_features_processed_df[categorical_to_scale] = scaler_cat.fit_transform(final_features_processed_df[categorical_to_scale])

In [216]:
def calculate_feature_weights(df, target='rating'):
    features = [col for col in df.select_dtypes(include=['number']).columns if col != target and col not in ['drug_name', 'medical_condition']]
    if not features:
        raise ValueError('No numerical features available for model fitting.')
    
    X = df[features]
    y = df[target]
    
    X = X.fillna(X.mean())
    valid_indices = y.notna()
    X = X[valid_indices]
    y = y[valid_indices]
    
    model = RandomForestRegressor(random_state=42)
    model.fit(X, y)
    
    importances = dict(zip(features, model.feature_importances_))
    total = sum(importances.values())
    weights = {f: imp / total for f, imp in importances.items()}
    return weights, features

weights, features = calculate_feature_weights(final_features_processed_df)

In [217]:
def score_drug(row, weights, features, invert_features=None):
    if invert_features is None:
        invert_features = ['num_side_effects']  # Higher side effects are worse

    score = 0.0
    for feature in features:
        if feature in weights:
            value = row[feature]
            if feature in invert_features:
                score += weights[feature] * (1 - value)  # Invert for worse features
            else:
                score += weights[feature] * value
    return score

In [218]:
def filter_drugs_by_condition(df, condition):
    return df[df['medical_condition'] == condition]


In [219]:
def recommend_drugs(df, condition, weights, features, top_n=5):
    filtered = filter_drugs_by_condition(df, condition).copy()  # Create a copy to avoid warnings
    if filtered.empty:
        return pd.DataFrame(columns=['drug_name', 'score'])
    
    filtered.loc[:, 'score'] = filtered.apply(lambda row: score_drug(row, weights, features), axis=1)
    ranked = filtered.sort_values(by='score', ascending=False)
    return ranked[['drug_name', 'score']].head(top_n)

In [228]:
top_drugs = recommend_drugs(final_features_processed_df, 'Angina', weights, features, top_n=5)
print('Top recommended drugs:')
print(top_drugs)

Top recommended drugs:
                  drug_name     score
443                 aspirin  0.751870
446              metoprolol  0.747227
444           nitroglycerin  0.743366
445                  Ranexa  0.730319
447  isosorbide mononitrate  0.717196


In [221]:
drug_score_map = dict(zip(top_drugs["drug_name"], top_drugs["score"]))
print("Drug → Score mapping:", drug_score_map)

Drug → Score mapping: {'Accutane': 0.6722106524711426, 'isotretinoin': 0.6665909733416683, 'spironolactone': 0.6471265229414442, 'doxycycline': 0.64233605142593, 'Bactrim': 0.6049867166771916}
