In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load the dataset into a DataFrame.
# NOTE(review): the path is absolute (looks like a Colab mount — confirm), so the
# notebook will not run unchanged on a machine without /content/cleaned-data.csv.
df = pd.read_csv("/content/cleaned-data.csv")

In [None]:
# Discard every row that has a missing value in any column, then summarise
# what is left (row count, dtypes, non-null counts).
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2538 entries, 0 to 2737
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   drug_name                      2538 non-null   object 
 1   medical_condition              2538 non-null   object 
 2   side_effects                   2538 non-null   object 
 3   generic_name                   2538 non-null   object 
 4   drug_classes                   2538 non-null   object 
 5   activity                       2538 non-null   float64
 6   rx_otc                         2538 non-null   object 
 7   pregnancy_category             2538 non-null   object 
 8   csa                            2538 non-null   object 
 9   medical_condition_description  2538 non-null   object 
dtypes: float64(1), object(9)
memory usage: 218.1+ KB


In [None]:
import string

# Text normaliser shared by the categorical and free-text columns
def preprocess_text(text):
    """Return ``text`` lowercased with every ``string.punctuation`` character removed.

    Characters are deleted, not replaced by spaces, so ``"flu-like"`` becomes
    ``"flulike"``.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_remover)
# Run preprocess_text over every categorical and free-text column, overwriting
# each column with its normalised version.
for column in (
    'medical_condition', 'generic_name', 'rx_otc', 'pregnancy_category',
    'csa', 'side_effects', 'medical_condition_description',
):
    df[column] = df[column].map(preprocess_text)

# # Apply the text preprocessing to 'side_effects' and 'medical_condition_description'
# df['side_effects'] = df['side_effects'].apply(preprocess_text)
# df['medical_condition_description'] = df['medical_condition_description'].apply(preprocess_text)

In [None]:
# Strip the literal phrase "other name" (any casing) from each description.
# Only those exact characters are removed, so the plural "other names" leaves a
# stray "s" behind in the text.
df['medical_condition_description'] = df['medical_condition_description'].str.replace(
    r'other name', '', case=False, regex=True)

In [None]:
# Preview the first three rows to eyeball the result of the text normalisation
df.head(3)

Unnamed: 0,drug_name,medical_condition,side_effects,generic_name,drug_classes,activity,rx_otc,pregnancy_category,csa,medical_condition_description
0,doxycycline,acne,hives difficult breathing swelling in your fac...,doxycycline,"Miscellaneous antimalarials, Tetracyclines",0.87,rx,d,n,acne s acne vulgaris blackheads breakouts cyst...
1,spironolactone,acne,hives difficulty breathing swelling of your f...,spironolactone,"Aldosterone receptor antagonists, Potassium-sp...",0.82,rx,c,n,acne s acne vulgaris blackheads breakouts cyst...
2,minocycline,acne,skin rash fever swollen glands flulike symptom...,minocycline,Tetracyclines,0.48,rx,d,n,acne s acne vulgaris blackheads breakouts cyst...


In [None]:

# Build the feature matrix: one-hot encoded categorical columns followed by two
# TF-IDF blocks (side effects, condition description).
# NOTE(review): the encoder and both vectorizers are fitted on the full frame,
# before the train/test split done in a later cell, so the test rows influence
# the fitted categories, vocabulary and IDF weights — consider fitting on the
# training rows only.

# OneHotEncode categorical variables
# NOTE(review): drop='first' is combined with handle_unknown='ignore'; per the
# scikit-learn docs an unknown category is encoded as all zeros, which is the
# same encoding the dropped first category gets — confirm this is acceptable.
categorical_columns = ['medical_condition', 'generic_name', 'rx_otc', 'pregnancy_category', 'csa']
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
df_encoded_categorical = onehot_encoder.fit_transform(df[categorical_columns])

# TF-IDF Vectorization for text data
# Each vectorizer is capped at 500 terms (max_features=500) and its sparse
# output is converted to a dense array so it can be stacked with the one-hot block.
tfidf_side = TfidfVectorizer(max_features=500)
df_tfidf_side_effects = tfidf_side.fit_transform(df['side_effects']).toarray()
tfidf_condition = TfidfVectorizer(max_features=500)
df_tfidf_medical_condition_desc = tfidf_condition.fit_transform(df['medical_condition_description']).toarray()

# Combine all features
# Column order: one-hot block, side-effect TF-IDF, condition-description TF-IDF.
# Anything that builds features for prediction must stack them in this same order.
df_features = np.hstack([df_encoded_categorical, df_tfidf_side_effects, df_tfidf_medical_condition_desc])

# Feature matrix and regression target; the actual train/test split is done by
# train_test_split in a later cell.
X = df_features
y_activity = df['activity']

In [None]:
# Hold out 20% of the rows for testing (test_size=0.2); random_state=42 fixes
# the shuffle so the same split is produced on every run.
X_train, X_test, y_train_activity, y_test_activity = train_test_split(X, y_activity, test_size=0.2, random_state=42)

In [None]:
# Train a Random Forest Regressor for 'activity'
# 200 trees with a fixed random_state, fitted on the training split only.
rf_activity = RandomForestRegressor(n_estimators=200, random_state=42)
rf_activity.fit(X_train, y_train_activity)

In [None]:
# Predict 'activity' for the held-out test rows with the fitted Random Forest
y_pred_activity = rf_activity.predict(X_test)

In [None]:
# Score the Random Forest on the held-out split: mean squared error and R^2
mse_activity = mean_squared_error(y_true=y_test_activity, y_pred=y_pred_activity)
r2_activity = r2_score(y_true=y_test_activity, y_pred=y_pred_activity)

print(f"Random forest Model - MSE: {mse_activity}, R2: {r2_activity}")


Random forest Model - MSE: 0.02052975561266791, R2: 0.28278418024567187


# XGBoost

In [None]:
# Train an XGBoost Regressor for 'activity'
# xgboost is imported here because no earlier cell brings the `xgb` alias into
# scope; without it this cell raises NameError on a fresh kernel.
import xgboost as xgb

# Squared-error objective, 300 shallow-learning-rate boosting rounds of depth-7
# trees, with a fixed random_state; fitted on the training split only.
xgb_activity = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.01,
    max_depth=7,
    n_estimators=300,
    random_state=42,
)
xgb_activity.fit(X_train, y_train_activity)

In [None]:
# Predict 'activity' for the held-out test rows with the fitted XGBoost model
y_pred_activity_xgb = xgb_activity.predict(X_test)

In [None]:
# Score the XGBoost model on the held-out split: mean squared error and R^2
mse_activity_xgb = mean_squared_error(y_true=y_test_activity, y_pred=y_pred_activity_xgb)
r2_activity_xgb = r2_score(y_true=y_test_activity, y_pred=y_pred_activity_xgb)

print(f"XGBoost Activity Model - MSE: {mse_activity_xgb}, R2: {r2_activity_xgb}")

XGBoost Activity Model - MSE: 0.021110199162950583, R2: 0.26250613580172255


In [None]:
import pickle

# Persist the fitted transformers and both trained models so the prediction
# section can reload them without retraining. Each file is opened in a `with`
# block so the handle is flushed and closed as soon as the dump finishes,
# instead of being left open until garbage collection.
for pickle_path, fitted_object in [
    ('onehot_encoder.pkl', onehot_encoder),
    ('tfidf_side.pkl', tfidf_side),
    ('tfidf_condition.pkl', tfidf_condition),
    ('rf_activity.pkl', rf_activity),
    ('xgb_activity.pkl', xgb_activity),
]:
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(fitted_object, pickle_file)

# GUI App

In [None]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Reload the fitted transformers and trained models written by the training section.
# NOTE: pickle.load can execute arbitrary code while unpickling — only load
# artifact files that this notebook produced itself, never files from an
# untrusted source.
onehot_encoder = _load_pickle('onehot_encoder.pkl')
tfidf_side = _load_pickle('tfidf_side.pkl')
tfidf_condition = _load_pickle('tfidf_condition.pkl')
rf_activity = _load_pickle('rf_activity.pkl')
xgb_activity = _load_pickle('xgb_activity.pkl')



In [None]:
import string

# Text normaliser; must behave exactly like the one used when the training data was prepared
def preprocess_text(text):
    """Return ``text`` lowercased with every ``string.punctuation`` character removed.

    Characters are deleted, not replaced by spaces, so ``"flu-like"`` becomes
    ``"flulike"``.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_remover)

In [None]:
# Categorical input fields read from the user input by predict_activity.
# The list and its order match the columns the one-hot encoder was fitted on in
# the training section; keep the two in sync.
categorical_columns = ['medical_condition', 'generic_name', 'rx_otc', 'pregnancy_category', 'csa']

In [None]:
# Function to preprocess user input and predict activity
def predict_activity(user_input, model_type='random_forest'):
    """Predict the ``activity`` value for a single drug record.

    The raw input goes through the same preparation the training data
    received: categorical values and free text are lowercased and stripped of
    punctuation with ``preprocess_text``, the phrase ``other name`` is removed
    from the condition description, and the fitted one-hot encoder and TF-IDF
    vectorizers are applied before the three feature blocks are stacked.

    Parameters
    ----------
    user_input : dict
        Must contain every key listed in ``categorical_columns`` plus
        ``'side_effects'`` and ``'medical_condition_description'``. All values
        are expected to be strings.
    model_type : str, default 'random_forest'
        ``'random_forest'`` to use ``rf_activity`` or ``'xgboost'`` to use
        ``xgb_activity``.

    Returns
    -------
    The first (and only) element of the selected model's prediction array.

    Raises
    ------
    KeyError
        If ``user_input`` lacks one of the required keys.
    ValueError
        If ``model_type`` is neither ``'random_forest'`` nor ``'xgboost'``.
    """
    # Normalise the categorical values exactly like the training frame. The
    # encoder was fitted on lowercased, punctuation-free strings and uses
    # handle_unknown='ignore', so a raw value such as 'Acne' or 'Rx' would
    # otherwise be silently encoded as all zeros instead of matching
    # 'acne' / 'rx'.
    user_categorical = [preprocess_text(user_input[col]) for col in categorical_columns]

    # Wrap the row in a DataFrame carrying the training column names so the
    # encoder receives the same named features it was fitted with.
    user_categorical_frame = pd.DataFrame([user_categorical], columns=categorical_columns)
    user_categorical_encoded = onehot_encoder.transform(user_categorical_frame)

    # Preprocess text inputs
    user_side_effects = preprocess_text(user_input['side_effects'])
    # The training descriptions had the phrase "other name" removed after
    # normalisation; the text is already lowercase here, so a plain replace
    # mirrors that case-insensitive removal.
    user_medical_condition_desc = preprocess_text(
        user_input['medical_condition_description']
    ).replace('other name', '')

    # Vectorize text inputs with the vocabularies learned during training
    user_tfidf_side_effects = tfidf_side.transform([user_side_effects]).toarray()
    user_tfidf_medical_condition_desc = tfidf_condition.transform([user_medical_condition_desc]).toarray()

    # Combine all features in the training column order:
    # one-hot block, side-effect TF-IDF, condition-description TF-IDF
    user_features = np.hstack([user_categorical_encoded, user_tfidf_side_effects, user_tfidf_medical_condition_desc])

    # Predict activity based on selected model
    if model_type == 'random_forest':
        prediction = rf_activity.predict(user_features)
    elif model_type == 'xgboost':
        prediction = xgb_activity.predict(user_features)
    else:
        raise ValueError("Invalid model type. Choose 'random_forest' or 'xgboost'.")

    # A single row was passed in, so the prediction array has one element
    return prediction[0]



In [None]:
# Example usage of the prediction function
# One raw record: the five categorical fields named in categorical_columns plus
# the two free-text fields that predict_activity reads.
user_input = {
    'generic_name': 'doxycycline',
    'medical_condition': 'Acne',
    'rx_otc': 'Rx',
    'pregnancy_category': 'D',
    'csa': 'N',
    'side_effects': 'hives, difficult breathing, swelling',
    'medical_condition_description': 'Acne Vulgaris, Blackheads, '
}

# Predict using Random Forest
predicted_activity_rf = predict_activity(user_input, model_type='random_forest')
print(f"Predicted Activity (Random Forest): {predicted_activity_rf}")

# Predict using XGBoost (same record, second model, for comparison)
predicted_activity_xgb = predict_activity(user_input, model_type='xgboost')
print(f"Predicted Activity (XGBoost): {predicted_activity_xgb}")

Predicted Activity (Random Forest): 0.1754963095238096
Predicted Activity (XGBoost): 0.1300130933523178


