In [None]:
import html
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Fetch the NLTK corpora needed for stop-word removal and lemmatization.
for corpus_name in ('stopwords', 'wordnet'):
    download_ok = nltk.download(corpus_name)
# Surface the status of the last download as the cell's output.
download_ok

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Location of the raw drug dataset CSV. This is an absolute path, so it must
# be adjusted when the notebook runs in an environment without /content.
file_path = "/content/drug1.csv"


In [None]:
# Load the raw dataset and report its size and schema.
df = pd.read_csv(file_path)
print("Initial Dataset Shape:", df.shape)
print("\nInitial Dataset Info:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() emits a stray "None" line.
df.info()

Initial Dataset Shape: (399, 11)

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   drug         399 non-null    object 
 1   dosage       399 non-null    float64
 2   condition    399 non-null    object 
 3   review       399 non-null    object 
 4   rating       399 non-null    int64  
 5   usefulCount  399 non-null    int64  
 6   bp           399 non-null    object 
 7   sugar        399 non-null    object 
 8   temperature  399 non-null    int64  
 9   age          399 non-null    int64  
 10  Sideeffects  399 non-null    object 
dtypes: float64(1), int64(4), object(6)
memory usage: 34.4+ KB
None


In [None]:
# Preview the first rows of the freshly loaded frame with rich display.
print("\nFirst 5 Rows:")
display(df.head(n=5))


First 5 Rows:


Unnamed: 0,drug,dosage,condition,review,rating,usefulCount,bp,sugar,temperature,age,Sideeffects
0,Mirtazapine,15.0,Depression,"""I&#039;ve tried a few antidepressants over th...",10,22,normal,normal,97,30,dizziness
1,Mesalamine,250.0,Lymphocytic Colitis,"""I am newly diagnosed with LC, I am 38 years o...",8,23,abnormal,abnormal,97,22,headache
2,Bactrim,0.24,Urinary Tract Infection,"""Quick reduction of symptoms""",9,3,abnormal,abnormal,103,56,vomting
3,Contrave,8.0,Weight Loss,"""Contrave combines drugs that were used for al...",9,35,abnormal,abnormal,100,35,vomting and headache
4,LEVORA,0.15,Birth Control,"""I was on this pill for almost two years. It d...",2,3,abnormal,abnormal,97,30,headache and dizziness


In [None]:
# Per-column count of missing entries in the raw data.
print("\nMissing Values Before Imputation:")
missing_before = df.isna().sum()
print(missing_before)


Missing Values Before Imputation:
drug           0
dosage         0
condition      0
review         0
rating         0
usefulCount    0
bp             0
sugar          0
temperature    0
age            0
Sideeffects    0
dtype: int64


In [None]:
# Imputers: most frequent value for categorical columns, mean for numeric ones.
imputer_categorical = SimpleImputer(strategy='most_frequent')
imputer_numeric = SimpleImputer(strategy='mean')

# Actually apply the imputers. Only columns that contain missing values are
# touched, so complete columns keep their original dtype and the step is a
# no-op (and safe to re-run) when nothing is missing.
columns_with_missing = df.columns[df.isnull().any()]
missing_categorical = df[columns_with_missing].select_dtypes(include=['object']).columns
missing_numeric = df[columns_with_missing].select_dtypes(include=['number']).columns

if len(missing_categorical) > 0:
    df[missing_categorical] = imputer_categorical.fit_transform(df[missing_categorical])
if len(missing_numeric) > 0:
    df[missing_numeric] = imputer_numeric.fit_transform(df[missing_numeric])

In [None]:
# Re-count missing entries per column at this stage of the pipeline.
print("\nMissing Values After Imputation:")
missing_after = df.isna().sum()
print(missing_after)


Missing Values After Imputation:
drug           0
dosage         0
condition      0
review         0
rating         0
usefulCount    0
bp             0
sugar          0
temperature    0
age            0
Sideeffects    0
dtype: int64


In [None]:
print("\nEncoding Categorical Columns...")
# Only the encoder object is created here; this cell does not transform any column.
label_encoder = LabelEncoder()


Encoding Categorical Columns...


In [None]:
print("\nScaling Numeric Features...")
# Scaler instance only; it is fitted and applied to df in a later cell.
scaler = StandardScaler()


Scaling Numeric Features...


In [None]:
# Names of every float64/int64 column of df. For the dataset whose info()
# report is shown earlier, that is dosage, rating, usefulCount, temperature and age.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns


In [None]:
# Standardize the numeric columns, overwriting the raw values stored in df.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split performed in a later cell, so test rows influence the
# scaling statistics — confirm this is acceptable for the evaluation.
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

print("Numeric Features Scaled Successfully!")

Numeric Features Scaled Successfully!


In [None]:
# Shared text-normalization helpers used by clean_text:
# a set of English stop words for fast membership tests, and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
def clean_text(text):
    """Normalize a free-text value into a space-separated string of lemmas.

    Steps: decode HTML character references, replace non-word characters with
    spaces, lowercase, split on whitespace, drop English stop words and
    lemmatize the remaining tokens.

    Args:
        text: The raw text. Non-string values (e.g. NaN from a missing cell)
            are treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces ('' for non-string input).
    """
    # Missing cells arrive as float NaN when applied over a pandas column;
    # re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''
    # Decode entities such as "&#039;" first; otherwise stripping the
    # punctuation leaves junk tokens like "039" in the output.
    text = html.unescape(text)
    text = re.sub(r'\W', ' ', text)
    tokens = text.lower().split()
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(lemmas)

In [None]:
# Column holding the free text that should be normalized with clean_text.
text_column = 'medical_condition_description'

if text_column in df.columns:
    print(f"\nCleaning '{text_column}' column...")
    df[text_column] = df[text_column].apply(clean_text)
    print("Text Preprocessing Complete!")
else:
    # A missing column used to be skipped without any output, which hid the
    # fact that no text preprocessing had been performed at all.
    print(f"\nSkipping text cleaning: column '{text_column}' not found. "
          f"Available columns: {list(df.columns)}")

In [None]:
print("\nSplitting Dataset into Train and Test Sets...")
# NOTE(review): the label is the free-text 'review' column, so the classifiers
# fitted on y treat each distinct review string as its own class — confirm this
# is the intended target (a categorical column such as 'Sideeffects' may have
# been meant).
target_column = 'review'
# Features are every remaining column; the label is the target column alone.
X = df.drop(columns=[target_column])
y = df[target_column]


Splitting Dataset into Train and Test Sets...


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Confirm the split and show the resulting (rows, columns) of each feature set.
print("Train-Test Split Complete!")
print(f"Training Set Shape: {X_train.shape} Testing Set Shape: {X_test.shape}")

Train-Test Split Complete!
Training Set Shape: (319, 10) Testing Set Shape: (80, 10)


In [None]:
# Destination of the preprocessed dataset (written without the index column).
processed_file_path = "/content/processed_drug_dataset.csv"

print("\nSaving Preprocessed Data...")
df.to_csv(processed_file_path, index=False)
print(f"Preprocessed dataset saved at: {processed_file_path}")


Saving Preprocessed Data...
Preprocessed dataset saved at: /content/processed_drug_dataset.csv


In [None]:
# Preview the first rows of the processed frame with rich display.
print("\nFinal Processed Dataset:")
display(df.head(n=5))


Final Processed Dataset:


Unnamed: 0,drug,dosage,condition,review,rating,usefulCount,bp,sugar,temperature,age,Sideeffects
0,Mirtazapine,-0.267876,Depression,"""I&#039;ve tried a few antidepressants over th...",0.886904,0.023024,normal,normal,-0.83493,-0.774044,dizziness
1,Mesalamine,0.713186,Lymphocytic Colitis,"""I am newly diagnosed with LC, I am 38 years o...",0.277301,0.06146,abnormal,abnormal,-0.83493,-1.189017,headache
2,Bactrim,-0.329495,Urinary Tract Infection,"""Quick reduction of symptoms""",0.582103,-0.707275,abnormal,abnormal,2.004306,0.574618,vomting
3,Contrave,-0.2971,Weight Loss,"""Contrave combines drugs that were used for al...",0.582103,0.522701,abnormal,abnormal,0.584688,-0.514686,vomting and headache
4,LEVORA,-0.329871,Birth Control,"""I was on this pill for almost two years. It d...",-1.551509,-0.707275,abnormal,abnormal,-0.83493,-0.774044,headache and dizziness


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [None]:
# Boolean mask of training rows whose label is present.
valid_indices = y_train.notna()

# Apply the same mask to features and labels so they stay aligned.
X_train, y_train = X_train.loc[valid_indices], y_train.loc[valid_indices]

In [None]:
# Sanity check: number of missing labels left in the training target.
print("Missing Values in y_train After Removal:")
remaining_missing = y_train.isna().sum()
print(remaining_missing)


Missing Values in y_train After Removal:
0


In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """Print accuracy, classification report and confusion matrix for a fitted model.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels corresponding to ``X_test``.
        model_name: Label used in the printed header.

    Returns:
        None. All results are written to stdout.
    """
    predictions = model.predict(X_test)
    print(f"\n{model_name} Metrics:")

    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy:.2f}")

    report = classification_report(y_test, predictions)
    print(f"Classification Report:\n{report}")

    matrix = confusion_matrix(y_test, predictions)
    print(f"Confusion Matrix:\n{matrix}")

In [None]:
# One fitted encoder per categorical feature, keyed by column name, so every
# integer code can be mapped back to its category (inverse_transform / classes_).
# Re-fitting a single shared encoder would retain only the last column's mapping.
feature_encoders = {}

# `encoder` is kept for backward compatibility: as before, it ends up being
# the encoder fitted on the last object column (or an unfitted one if there
# are no object columns).
encoder = LabelEncoder()

# Integer-encode every object (string) column of X.
for col in X.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col])
    feature_encoders[col] = encoder

# Split the fully numeric feature matrix; the fixed seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training Set Shape: {X_train.shape}, Testing Set Shape: {X_test.shape}")

Training Set Shape: (319, 10), Testing Set Shape: (80, 10)


In [None]:
# Same split call, with the same arguments and seed, as the one at the end of the previous cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training Set Shape: {X_train.shape}, Testing Set Shape: {X_test.shape}")


Training Set Shape: (319, 10), Testing Set Shape: (80, 10)


In [None]:
print("\nTraining Decision Tree Model...")
# fit() returns the estimator itself, so construction and training are chained.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
evaluate_model(model=dt_model, X_test=X_test, y_test=y_test, model_name="Decision Tree")


Training Decision Tree Model...

Decision Tree Metrics:
Accuracy: 1.00
Classification Report:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              precision    recall  f1-score   support

                                                                                  

In [None]:
print("\nTraining Random Forest Model...")
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
evaluate_model(model=rf_model, X_test=X_test, y_test=y_test, model_name="Random Forest")


Training Random Forest Model...

Random Forest Metrics:
Accuracy: 1.00
Classification Report:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              precision    recall  f1-score   support

                                                                                  

In [None]:
print("\nTraining SVM Model...")
# fit() returns the estimator itself, so construction and training are chained.
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
evaluate_model(model=svm_model, X_test=X_test, y_test=y_test, model_name="SVM")


Training SVM Model...

SVM Metrics:
Accuracy: 1.00
Classification Report:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              precision    recall  f1-score   support

                                                                                                      