In [139]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, RobustScaler, PowerTransformer
import re
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

In [140]:
df = pd.read_csv('drugsdata.csv')
df.head()

Unnamed: 0,drug_name,medical_condition,side_effects,generic_name,drug_classes,brand_names,activity,rx_otc,pregnancy_category,csa,alcohol,related_drugs,medical_condition_description,rating,no_of_reviews,drug_link,medical_condition_url
0,doxycycline,Acne,"(hives, difficult breathing, swelling in your ...",doxycycline,"Miscellaneous antimalarials, Tetracyclines","Acticlate, Adoxa CK, Adoxa Pak, Adoxa TT, Alod...",87%,Rx,D,N,X,amoxicillin: https://www.drugs.com/amoxicillin...,Acne Other names: Acne Vulgaris; Blackheads; B...,6.8,760.0,https://www.drugs.com/doxycycline.html,https://www.drugs.com/condition/acne.html
1,spironolactone,Acne,hives ; difficulty breathing; swelling of your...,spironolactone,"Aldosterone receptor antagonists, Potassium-sp...","Aldactone, CaroSpir",82%,Rx,C,N,X,amlodipine: https://www.drugs.com/amlodipine.h...,Acne Other names: Acne Vulgaris; Blackheads; B...,7.2,449.0,https://www.drugs.com/spironolactone.html,https://www.drugs.com/condition/acne.html
2,minocycline,Acne,"skin rash, fever, swollen glands, flu-like sym...",minocycline,Tetracyclines,"Dynacin, Minocin, Minolira, Solodyn, Ximino, V...",48%,Rx,D,N,,amoxicillin: https://www.drugs.com/amoxicillin...,Acne Other names: Acne Vulgaris; Blackheads; B...,5.7,482.0,https://www.drugs.com/minocycline.html,https://www.drugs.com/condition/acne.html
3,Accutane,Acne,problems with your vision or hearing; muscle o...,isotretinoin (oral),"Miscellaneous antineoplastics, Miscellaneous u...",,41%,Rx,X,N,X,doxycycline: https://www.drugs.com/doxycycline...,Acne Other names: Acne Vulgaris; Blackheads; B...,7.9,623.0,https://www.drugs.com/accutane.html,https://www.drugs.com/condition/acne.html
4,clindamycin,Acne,hives ; difficult breathing; swelling of your ...,clindamycin topical,"Topical acne agents, Vaginal anti-infectives","Cleocin T, Clindacin ETZ, Clindacin P, Clindag...",39%,Rx,B,N,,doxycycline: https://www.drugs.com/doxycycline...,Acne Other names: Acne Vulgaris; Blackheads; B...,7.4,146.0,https://www.drugs.com/mtm/clindamycin-topical....,https://www.drugs.com/condition/acne.html


In [141]:
df = df.drop(columns=['related_drugs', 'drug_link', 'medical_condition_url', 'brand_names', 'medical_condition_description', 'generic_name'])

In [142]:
df.head()

Unnamed: 0,drug_name,medical_condition,side_effects,drug_classes,activity,rx_otc,pregnancy_category,csa,alcohol,rating,no_of_reviews
0,doxycycline,Acne,"(hives, difficult breathing, swelling in your ...","Miscellaneous antimalarials, Tetracyclines",87%,Rx,D,N,X,6.8,760.0
1,spironolactone,Acne,hives ; difficulty breathing; swelling of your...,"Aldosterone receptor antagonists, Potassium-sp...",82%,Rx,C,N,X,7.2,449.0
2,minocycline,Acne,"skin rash, fever, swollen glands, flu-like sym...",Tetracyclines,48%,Rx,D,N,,5.7,482.0
3,Accutane,Acne,problems with your vision or hearing; muscle o...,"Miscellaneous antineoplastics, Miscellaneous u...",41%,Rx,X,N,X,7.9,623.0
4,clindamycin,Acne,hives ; difficult breathing; swelling of your ...,"Topical acne agents, Vaginal anti-infectives",39%,Rx,B,N,,7.4,146.0


In [143]:
def get_top_3_side_effects(side_effects_str):
    if pd.isna(side_effects_str) or side_effects_str == '':
        return ['', '', '']
    
    effects = re.split(r'[;,.\n]', str(side_effects_str))
    effects = [effect.strip().lower() for effect in effects if effect.strip()]
    
    top_3 = effects[:3]
    while len(top_3) < 3:
        top_3.append('')
    
    return top_3

def split_drug_classes(classes_str):
    if pd.isna(classes_str) or classes_str == '':
        return ['', '', '']
    
    classes = [c.strip() for c in str(classes_str).split(',')]
    classes = classes[:3]
    while len(classes) < 3:
        classes.append('')
    
    return classes

df['top_3_side_effects'] = df['side_effects'].apply(get_top_3_side_effects)
df['drug_class_1'] = df['drug_classes'].apply(lambda x: split_drug_classes(x)[0])
df['drug_class_2'] = df['drug_classes'].apply(lambda x: split_drug_classes(x)[1])
df['drug_class_3'] = df['drug_classes'].apply(lambda x: split_drug_classes(x)[2])

df['side_effect_1'] = df['top_3_side_effects'].apply(lambda x: x[0])
df['side_effect_2'] = df['top_3_side_effects'].apply(lambda x: x[1])
df['side_effect_3'] = df['top_3_side_effects'].apply(lambda x: x[2])

le_medical = LabelEncoder()
le_drug_class_1 = LabelEncoder()
le_drug_class_2 = LabelEncoder()
le_drug_class_3 = LabelEncoder()
le_side_effect_1 = LabelEncoder()
le_side_effect_2 = LabelEncoder()
le_side_effect_3 = LabelEncoder()

df['medical_condition_encoded'] = le_medical.fit_transform(df['medical_condition'])
df['drug_class_1_encoded'] = le_drug_class_1.fit_transform(df['drug_class_1'])
df['drug_class_2_encoded'] = le_drug_class_2.fit_transform(df['drug_class_2'])
df['drug_class_3_encoded'] = le_drug_class_3.fit_transform(df['drug_class_3'])
df['side_effect_1_encoded'] = le_side_effect_1.fit_transform(df['side_effect_1'])
df['side_effect_2_encoded'] = le_side_effect_2.fit_transform(df['side_effect_2'])
df['side_effect_3_encoded'] = le_side_effect_3.fit_transform(df['side_effect_3'])

le_rx_otc = LabelEncoder()
le_pregnancy_category = LabelEncoder()
le_csa = LabelEncoder()
le_alchohol = LabelEncoder()

df['rx_otc_encoded'] = le_rx_otc.fit_transform(df['rx_otc'])
df['pregnancy_category_encoded'] = le_pregnancy_category.fit_transform(df['pregnancy_category'])
df['csa_encoded'] = le_csa.fit_transform(df['csa'])
df['alcohol_encoded'] = le_alchohol.fit_transform(df['alcohol'])

final_features = df[[
    'medical_condition_encoded',
    'drug_class_1_encoded',
    'drug_class_2_encoded', 
    'drug_class_3_encoded',
    'side_effect_1_encoded',
    'side_effect_2_encoded',
    'side_effect_3_encoded',
    'activity',
    'rating',
    'no_of_reviews',
    'drug_name',
    'rx_otc_encoded',
    'pregnancy_category_encoded',
    'csa_encoded',
    'alcohol_encoded'
]].copy()


In [144]:
# Convert to decimal (0.87 format)
final_features['activity'] = final_features['activity'].str.rstrip('%').astype(float) / 100

final_features.head()

Unnamed: 0,medical_condition_encoded,drug_class_1_encoded,drug_class_2_encoded,drug_class_3_encoded,side_effect_1_encoded,side_effect_2_encoded,side_effect_3_encoded,activity,rating,no_of_reviews,drug_name,rx_otc_encoded,pregnancy_category_encoded,csa_encoded,alcohol_encoded
0,2,131,46,0,2,28,165,0.87,6.8,760.0,doxycycline,1,3,5,0
1,2,9,35,0,75,29,171,0.82,7.2,449.0,spironolactone,1,2,5,0
2,2,198,0,0,170,38,173,0.48,5.7,482.0,minocycline,1,3,5,1
3,2,132,25,0,129,89,13,0.41,7.9,623.0,Accutane,1,5,5,0
4,2,206,51,0,75,28,171,0.39,7.4,146.0,clindamycin,1,1,5,1


In [145]:
encoded_categorical_cols = [
    'medical_condition_encoded',
    'drug_class_1_encoded', 
    'drug_class_2_encoded',
    'drug_class_3_encoded',
    'side_effect_1_encoded',
    'side_effect_2_encoded',
    'side_effect_3_encoded',
    'acitivty',
    'rating',
    'no_of_reviews',
    'rx_otc_encoded',
    'pregnancy_category_encoded',
    'csa_encoded',
    'alcohol_encoded'
]

for col in encoded_categorical_cols:
    if col in final_features.columns:
        final_features[col] = final_features[col].fillna(-1)

numerical_cols = ['activity', 'rating', 'no_of_reviews']

for col in numerical_cols:
    if col in final_features.columns:
        if col == 'activity':
            median_val = final_features[col].median()
            final_features[col] = final_features[col].fillna(median_val)
        elif col == 'rating':
            final_features[col] = final_features[col].fillna(0)
        elif col == 'no_of_reviews':
            final_features[col] = final_features[col].fillna(0)

if 'drug_name' in final_features.columns:
    final_features['drug_name'] = final_features['drug_name'].fillna('Unknown')


In [146]:
final_features.head()

Unnamed: 0,medical_condition_encoded,drug_class_1_encoded,drug_class_2_encoded,drug_class_3_encoded,side_effect_1_encoded,side_effect_2_encoded,side_effect_3_encoded,activity,rating,no_of_reviews,drug_name,rx_otc_encoded,pregnancy_category_encoded,csa_encoded,alcohol_encoded
0,2,131,46,0,2,28,165,0.87,6.8,760.0,doxycycline,1,3,5,0
1,2,9,35,0,75,29,171,0.82,7.2,449.0,spironolactone,1,2,5,0
2,2,198,0,0,170,38,173,0.48,5.7,482.0,minocycline,1,3,5,1
3,2,132,25,0,129,89,13,0.41,7.9,623.0,Accutane,1,5,5,0
4,2,206,51,0,75,28,171,0.39,7.4,146.0,clindamycin,1,1,5,1


In [147]:
numerical_columns = ['activity', 'rating', 'no_of_reviews']
numerical_columns = [col for col in numerical_columns if col in final_features.columns]

encoded_columns = [col for col in final_features.columns if 'encoded' in col]

text_columns = ['drug_name']

skewed_columns = numerical_columns + encoded_columns

skewed_and_scaled_transformer = Pipeline([
    ('power_transform', PowerTransformer(method='yeo-johnson')),
    ('robust_scale', RobustScaler())
])

scaled_only_transformer = RobustScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('skewed_and_scaled', skewed_and_scaled_transformer, skewed_columns),
        ('text_features', 'passthrough', text_columns)
    ],
    remainder='drop'
)

preprocessor.fit(final_features)
final_features_processed = preprocessor.transform(final_features)

feature_names = preprocessor.get_feature_names_out()
final_features_processed_df = pd.DataFrame(final_features_processed,
                                           columns=feature_names,
                                           index=final_features.index)



In [148]:
final_features_processed_df.head()

Unnamed: 0,skewed_and_scaled__activity,skewed_and_scaled__rating,skewed_and_scaled__no_of_reviews,skewed_and_scaled__medical_condition_encoded,skewed_and_scaled__drug_class_1_encoded,skewed_and_scaled__drug_class_2_encoded,skewed_and_scaled__drug_class_3_encoded,skewed_and_scaled__side_effect_1_encoded,skewed_and_scaled__side_effect_2_encoded,skewed_and_scaled__side_effect_3_encoded,skewed_and_scaled__rx_otc_encoded,skewed_and_scaled__pregnancy_category_encoded,skewed_and_scaled__csa_encoded,skewed_and_scaled__alcohol_encoded,text_features__drug_name
0,1.491112,0.251378,1.011458,-0.923105,-0.019204,3.276138,0.0,-2.020069,0.0,-0.068058,0.0,0.575622,0.0,-1.0,doxycycline
1,1.490553,0.27504,0.946841,-0.923105,-1.00619,3.276084,0.0,0.0,0.029603,0.0,0.0,0.0,0.0,-1.0,spironolactone
2,1.470653,0.182011,0.95577,-0.923105,0.38624,0.0,0.0,1.28055,0.269119,0.022722,0.0,0.575622,0.0,0.0,minocycline
3,1.454555,0.314728,0.987497,-0.923105,-0.012789,3.275929,0.0,0.787321,1.170656,-1.70405,0.0,1.333047,0.0,-1.0,Accutane
4,1.44786,0.286594,0.795743,-0.923105,0.43192,3.276149,0.0,0.0,0.0,0.0,0.0,-0.869228,0.0,0.0,clindamycin


In [149]:
def filter_drugs_by_condition(df, condition):
    return df[df['medical_condition'].str.contains(condition, case=False, na=False)]


In [153]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def calculate_feature_weights(df, target):
    # Select all numeric columns except the target
    features = [col for col in df.select_dtypes(include=['number']).columns if col != target]
    X = df[features]
    y = df[target]
    
    # Check for missing values in the target
    if y.isna().sum() > 0:
        # Option 1: Remove rows with missing target values
        valid_indices = y.notna()
        X = X[valid_indices]
        y = y[valid_indices]
    
    # Check for missing values in features
    if X.isna().any().any():
        X = X.fillna(X.mean())
    
    # Initialize and fit the model
    model = RandomForestRegressor(random_state=42)
    model.fit(X, y)
    
    # Calculate feature importances
    importances = dict(zip(features, model.feature_importances_))
    total = sum(importances.values())
    weights = {f: imp / total for f, imp in importances.items()}
    return weights, features

# Example usage
weights, features = calculate_feature_weights(df, target="rating")

In [154]:
def score_drug(row, weights, features, invert_features=None):
    if invert_features is None:
        invert_features = ["side_effects", "cost"]

    score = 0.0
    for feature in features:
        if feature in weights:
            if feature in invert_features:
                # invert score for lower-is-better features (normalize 0-1 first if needed)
                score += weights[feature] * (1 - row[feature])
            else:
                score += weights[feature] * row[feature]
    return score

In [155]:
def recommend_drugs(df, condition, weights, features, top_n=5):
    filtered = filter_drugs_by_condition(df, condition)
    if filtered.empty:
        return pd.DataFrame(columns=["drug_name", "score"])
    
    filtered["score"] = filtered.apply(lambda row: score_drug(row, weights, features), axis=1)
    ranked = filtered.sort_values(by="score", ascending=False)
    return ranked[["drug_name", "score"]].head(top_n)


In [156]:
top_drugs = recommend_drugs(df, "Acne", weights, features, top_n=5)
print("Top recommended drugs:")
print(top_drugs)

Top recommended drugs:
                         drug_name       score
7                     isotretinoin  244.870979
47    adapalene / benzoyl peroxide  243.150280
13                          Epiduo  200.806539
0                      doxycycline  194.300971
23  benzoyl peroxide / clindamycin  178.558118


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["score"] = filtered.apply(lambda row: score_drug(row, weights, features), axis=1)


In [157]:
drug_score_map = dict(zip(top_drugs["drug_name"], top_drugs["score"]))
print("Drug → Score mapping:", drug_score_map)

Drug → Score mapping: {'isotretinoin': 244.87097918835195, 'adapalene / benzoyl peroxide': 243.1502803249352, 'Epiduo': 200.80653882050564, 'doxycycline': 194.3009707985718, 'benzoyl peroxide / clindamycin': 178.55811766611467}
