In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the merged drugs datasheet into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs on a machine where this exact file exists.
DATA_PATH = '/Users/soumyadeepchatterjee/Desktop/WayneState/Winter24/CSC-5800-Intelligent-Systems/Final Project/Drugs datasheet merged (1).csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Count the missing (NaN) values in each column.
data.isna().sum()


drug_name                0
medical_condition        0
side_effects          1152
generic_name          1071
drug_classes          1110
brand_names           2241
activity                 0
rx_otc                   1
pregnancy_category     249
csa                      0
alcohol               1968
related_drugs         2497
rating                1842
no_of_reviews         1842
dtype: int64

In [None]:
# Placeholder written into each column wherever a value is missing.
# NOTE(review): missing 'rating' / 'no_of_reviews' become 0, so an unrated
# drug looks the same as one rated 0 -- confirm that is intended.
fill_values = {
    'side_effects': 'No side effects reported',
    'generic_name': 'Unknown',
    'drug_classes': 'Unknown',
    'brand_names': 'No brand names listed',
    'pregnancy_category': 'Not Classified',
    'alcohol': 'No information',
    'related_drugs': 'No related drugs listed',
    'rating': 0,
    'no_of_reviews': 0,
}

# Assign the filled column back rather than calling fillna(inplace=True) on
# data[column]: the in-place call acts on a column selection (chained
# assignment), which pandas 2.x warns about and which does not update `data`
# when Copy-on-Write is enabled.
for column, placeholder in fill_values.items():
    data[column] = data[column].fillna(placeholder)

# Show the information of the dataset after filling missing values to confirm the changes
data.info()

# Remove the percentage sign and convert 'activity' to float.
# Going through astype(str) keeps the cell re-runnable: once the column is
# already float, the .str accessor would otherwise raise. regex=False makes
# '%' an explicit literal match.
data['activity'] = (
    data['activity']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype(float)
)

# Confirm the data type change
data['activity'].dtype

# Lowercase the side effects and split each entry into a list at every
# semicolon (';'). Only plain strings are split; values that are not strings
# (for example lists produced by an earlier run of this cell) are left as they
# are, so re-running the cell does not wipe the column.
data['side_effects'] = data['side_effects'].apply(
    lambda effects: effects.lower().split(';') if isinstance(effects, str) else effects
)

# Display the first few entries to verify the changes
data['side_effects'].head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3959 entries, 0 to 3958
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   drug_name           3959 non-null   object 
 1   medical_condition   3959 non-null   object 
 2   side_effects        3959 non-null   object 
 3   generic_name        3959 non-null   object 
 4   drug_classes        3959 non-null   object 
 5   brand_names         3959 non-null   object 
 6   activity            3959 non-null   object 
 7   rx_otc              3958 non-null   object 
 8   pregnancy_category  3959 non-null   object 
 9   csa                 3959 non-null   object 
 10  alcohol             3959 non-null   object 
 11  related_drugs       3959 non-null   object 
 12  rating              3959 non-null   float64
 13  no_of_reviews       3959 non-null   float64
dtypes: float64(2), object(12)
memory usage: 433.1+ KB


0    [(hives, difficult breathing, swelling in your...
1    [hives ,  difficulty breathing,  swelling of y...
2    [skin rash, fever, swollen glands, flu-like sy...
3    [problems with your vision or hearing,  muscle...
4    [hives ,  difficult breathing,  swelling of yo...
Name: side_effects, dtype: object

In [None]:
def _as_side_effect_list(value):
    """Return `value` as a list of lowercase side-effect strings.

    - A list/tuple is kept as a list, with each entry lowercased.
    - A string is lowercased and split at ';'.
    - Any other value is stringified, lowercased and wrapped in a list.
    """
    if isinstance(value, (list, tuple)):
        return [str(entry).lower() for entry in value]
    if isinstance(value, str):
        return value.lower().split(';')
    return [str(value).lower()]


# Ensure every entry in 'side_effects' is a list of lowercase strings.
# Calling astype(str) on the column would turn an existing list into the text
# of its repr ("['hives ', ' difficult breathing', ...]"), which contains no
# ';' and therefore collapses every row into a single-element list.
data['side_effects'] = data['side_effects'].apply(_as_side_effect_list)

# Display the first few entries to verify the changes
data['side_effects'].head()

0    [['(hives, difficult breathing, swelling in yo...
1    [['hives ', ' difficulty breathing', ' swellin...
2    [['skin rash, fever, swollen glands, flu-like ...
3    [['problems with your vision or hearing', ' mu...
4    [['hives ', ' difficult breathing', ' swelling...
Name: side_effects, dtype: object

In [None]:
import re

# To analyze the most common side effects, we first need to flatten the list of side effects into a single list
from itertools import chain

# Walk every non-missing 'side_effects' entry and collect its individual
# items into one flat list, trimming surrounding whitespace from each item.
all_side_effects = [
    effect.strip()
    for effects in data['side_effects'].dropna()
    for effect in effects
]

# Frequency of each distinct side effect, most common first
side_effect_counts = pd.Series(all_side_effects).value_counts()

# Keep the ten most frequent side effects
top_10_side_effects = side_effect_counts.head(10)
top_10_side_effects


# Correcting normalization rules and reapplying

# Updated normalization function
def normalize_text_corrected(text):
    """Normalize one side-effect string so spelling variants are counted together.

    The text is lowercased and stripped of surrounding whitespace, then these
    whole-word variants are unified:

    - "difficulty"        -> "difficult"
    - "(hives)" / "hive"  -> "hives"
    - "wheeze"            -> "wheezing"
    - "breathe"           -> "breathing"

    Word boundaries (\\b) keep the rules from rewriting the inside of longer
    words; without them "shivering" (which contains "hive") would become
    "shivesring" and "wheezes" would become "wheezings".

    Parameters
    ----------
    text : str
        A single side-effect description.

    Returns
    -------
    str
        The normalized description.
    """
    text = text.lower().strip()
    text = re.sub(r'\bdifficulty\b', 'difficult', text)
    text = re.sub(r'\(hives\)|\bhive\b', 'hives', text)
    text = re.sub(r'\bwheeze\b', 'wheezing', text)
    text = re.sub(r'\bbreathe\b', 'breathing', text)
    return text

# Run every collected side effect through the corrected normalizer
normalized_effects_corrected = list(map(normalize_text_corrected, all_side_effects))

# Frequency of each normalized side effect, most common first
normalized_side_effect_counts_corrected = (
    pd.Series(normalized_effects_corrected).value_counts()
)

# Keep the ten most frequent normalized side effects
normalized_top_10_side_effects_corrected = (
    normalized_side_effect_counts_corrected.head(10)
)
normalized_top_10_side_effects_corrected

['no side effects reported']                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            1152
['hives ', ' difficult breathing', ' swelling of your face, lips, tongue, or throat. this medicine may cause serious side effects. stop using this medicine and call your doctor at once if you have: redness or swelling of the treated area', ' increased pain', ' or severe burning or skin irritation such as a rash, itching, pain, or blistering. less serious side effects may be mo

In [None]:
# Re-count the missing values per column after the cleaning steps.
# NOTE(review): 'rx_otc' is only filled in a later cell, so on a fresh
# top-to-bottom run this should still report one missing 'rx_otc' value.
data.isna().sum()

drug_name             0
medical_condition     0
side_effects          0
generic_name          0
drug_classes          0
brand_names           0
activity              0
rx_otc                0
pregnancy_category    0
csa                   0
alcohol               0
related_drugs         0
rating                0
no_of_reviews         0
dtype: int64

In [None]:
# Select the row(s) whose 'rx_otc' value is missing
missing_rx_otc_row = data.loc[data['rx_otc'].isna()]


In [None]:
# Display the row(s) with a missing 'rx_otc' value for inspection.
missing_rx_otc_row

Unnamed: 0,drug_name,medical_condition,side_effects,generic_name,drug_classes,brand_names,activity,rx_otc,pregnancy_category,csa,alcohol,related_drugs,rating,no_of_reviews
1617,Zegerid with Magnesium Hydroxide,GERD (Heartburn),[['no side effects reported']],"omeprazole, sodium bicarbonate and magnesium h...",Unknown,No brand names listed,0.0,,C,U,No information,omeprazole: https://www.drugs.com/omeprazole.h...,0.0,0.0


In [None]:
# Write 'Rx' into 'rx_otc' wherever the value is missing
data.loc[data['rx_otc'].isna(), 'rx_otc'] = 'Rx'


In [None]:
# Per drug: the length of its 'side_effects' list
data['num_side_effects'] = data['side_effects'].map(len)

# Correlation of the side-effect count with 'activity' and with 'rating'
correlation_with_activity = data['num_side_effects'].corr(data['activity'])
correlation_with_rating = data['num_side_effects'].corr(data['rating'])

(correlation_with_activity, correlation_with_rating)


# Feature matrix: side-effect count, activity, and one-hot encoded 'csa'
# (first category dropped); target: the drug rating.
X = pd.get_dummies(
    data[['num_side_effects', 'activity', 'csa']],
    columns=['csa'],
    drop_first=True,
)
y = data['rating']

# Hold out 20% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Depth-limited regression tree fitted on the training split
dt_model = DecisionTreeRegressor(max_depth=4, random_state=0)
dt_model.fit(X_train, y_train)

# Score the held-out predictions
y_pred = dt_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse}, R^2: {r2}')

from sklearn import tree
import matplotlib.pyplot as plt

# Large canvas so the node labels of the fitted tree stay legible
plt.figure(figsize=(35, 20), dpi=100)

# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter. class_names is omitted because it only
# applies to classifiers and dt_model is a regressor.
tree.plot_tree(dt_model, filled=True,
               feature_names=list(X.columns),
               proportion=False,
               rounded=True,
               fontsize=10)

# Save a high-resolution copy before showing the figure
plt.savefig('decision_tree_high_res.png', format='png', bbox_inches='tight', dpi=300)

plt.show()

In [None]:
# Preview the first five rows of the current DataFrame.
data.head()

Unnamed: 0,drug_name,medical_condition,side_effects,generic_name,drug_classes,brand_names,activity,rx_otc,pregnancy_category,csa,alcohol,related_drugs,rating,no_of_reviews
0,doxycycline,Acne,"[['(hives, difficult breathing, swelling in yo...",doxycycline,"Miscellaneous antimalarials, Tetracyclines","Acticlate, Adoxa CK, Adoxa Pak, Adoxa TT, Alod...",87.0,Rx,D,N,X,amoxicillin: https://www.drugs.com/amoxicillin...,6.8,760.0
1,spironolactone,Acne,"[['hives ', ' difficulty breathing', ' swellin...",spironolactone,"Aldosterone receptor antagonists, Potassium-sp...","Aldactone, CaroSpir",82.0,Rx,C,N,X,amlodipine: https://www.drugs.com/amlodipine.h...,7.2,449.0
2,minocycline,Acne,"[['skin rash, fever, swollen glands, flu-like ...",minocycline,Tetracyclines,"Dynacin, Minocin, Minolira, Solodyn, Ximino, V...",48.0,Rx,D,N,No information,amoxicillin: https://www.drugs.com/amoxicillin...,5.7,482.0
3,Accutane,Acne,"[['problems with your vision or hearing', ' mu...",isotretinoin (oral),"Miscellaneous antineoplastics, Miscellaneous u...",No brand names listed,41.0,Rx,X,N,X,doxycycline: https://www.drugs.com/doxycycline...,7.9,623.0
4,clindamycin,Acne,"[['hives ', ' difficult breathing', ' swelling...",clindamycin topical,"Topical acne agents, Vaginal anti-infectives","Cleocin T, Clindacin ETZ, Clindacin P, Clindag...",39.0,Rx,B,N,No information,doxycycline: https://www.drugs.com/doxycycline...,7.4,146.0
