In [None]:
import pandas as pd
from google.colab import files
from sklearn.preprocessing import MinMaxScaler
df = pd.read_csv("MH.csv")
### Display Original Data ###
print("\n Original Data (First 5 Rows):")
display(df.head())



 Original Data (First 5 Rows):


Unnamed: 0,Patient ID,Age,Gender,Diagnosis,Symptom Severity (1-10),Mood Score (1-10),Sleep Quality (1-10),Physical Activity (hrs/week),Medication,Therapy Type,Treatment Start Date,Treatment Duration (weeks),Stress Level (1-10),Outcome,Treatment Progress (1-10),AI-Detected Emotional State,Adherence to Treatment (%)
0,1,43,Female,Major Depressive Disorder,10,5,8,5,Mood Stabilizers,Interpersonal Therapy,2024-01-25,11,9,Deteriorated,7,Anxious,66
1,2,40,Female,Major Depressive Disorder,9,5,4,7,Antipsychotics,Interpersonal Therapy,2024-02-27,11,7,No Change,7,Neutral,78
2,3,55,Female,Major Depressive Disorder,6,3,4,3,SSRIs,Mindfulness-Based Therapy,2024-03-20,14,7,Deteriorated,5,Happy,62
3,4,34,Female,Major Depressive Disorder,6,3,6,5,SSRIs,Mindfulness-Based Therapy,2024-03-29,8,8,Deteriorated,10,Excited,72
4,5,52,Male,Panic Disorder,7,6,6,8,Anxiolytics,Interpersonal Therapy,2024-03-18,12,5,Deteriorated,6,Excited,63


In [None]:
### 1. Handling Missing Values ###
print("\n Missing Values Before Cleaning:")
print(df.isnull().sum())


 Missing Values Before Cleaning:
Patient ID                      0
Age                             0
Gender                          0
Diagnosis                       0
Symptom Severity (1-10)         0
Mood Score (1-10)               0
Sleep Quality (1-10)            0
Physical Activity (hrs/week)    0
Medication                      0
Therapy Type                    0
Treatment Start Date            0
Treatment Duration (weeks)      0
Stress Level (1-10)             0
Outcome                         0
Treatment Progress (1-10)       0
AI-Detected Emotional State     0
Adherence to Treatment (%)      0
dtype: int64


In [None]:
### 2. Removing Duplicates ###
print("\n Duplicates Before:", df.duplicated().sum())


 Duplicates Before: 0


In [None]:
### 3. Correcting Data Types ###
# Convert date column to datetime if applicable
if 'Treatment Start Date' in df.columns:
    df['Treatment Start Date'] = pd.to_datetime(df['Treatment Start Date'])

print(df.dtypes)


Patient ID                               int64
Age                                      int64
Gender                                  object
Diagnosis                               object
Symptom Severity (1-10)                  int64
Mood Score (1-10)                        int64
Sleep Quality (1-10)                     int64
Physical Activity (hrs/week)             int64
Medication                              object
Therapy Type                            object
Treatment Start Date            datetime64[ns]
Treatment Duration (weeks)               int64
Stress Level (1-10)                      int64
Outcome                                 object
Treatment Progress (1-10)                int64
AI-Detected Emotional State             object
Adherence to Treatment (%)               int64
dtype: object


In [None]:
### 4. Handling Outliers (IQR Method) ###
# Get numerical features/columns from the DataFrame 'df'
num_cols = df.select_dtypes(include=['number']).columns

# Then, proceed with outlier detection
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1
df_before = df.shape[0]
df = df[~((df[num_cols] < (Q1 - 1.5 * IQR)) | (df[num_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]
df_after = df.shape[0]

print(f"\nRows Before Outlier Removal: {df_before}")
print(f"Rows After Outlier Removal: {df_after}")
display(df.head())


Rows Before Outlier Removal: 500
Rows After Outlier Removal: 500


Unnamed: 0,Patient ID,Age,Gender,Diagnosis,Symptom Severity (1-10),Mood Score (1-10),Sleep Quality (1-10),Physical Activity (hrs/week),Medication,Therapy Type,Treatment Start Date,Treatment Duration (weeks),Stress Level (1-10),Outcome,Treatment Progress (1-10),AI-Detected Emotional State,Adherence to Treatment (%)
0,1,43,Female,Major Depressive Disorder,10,5,8,5,Mood Stabilizers,Interpersonal Therapy,2024-01-25,11,9,Deteriorated,7,Anxious,66
1,2,40,Female,Major Depressive Disorder,9,5,4,7,Antipsychotics,Interpersonal Therapy,2024-02-27,11,7,No Change,7,Neutral,78
2,3,55,Female,Major Depressive Disorder,6,3,4,3,SSRIs,Mindfulness-Based Therapy,2024-03-20,14,7,Deteriorated,5,Happy,62
3,4,34,Female,Major Depressive Disorder,6,3,6,5,SSRIs,Mindfulness-Based Therapy,2024-03-29,8,8,Deteriorated,10,Excited,72
4,5,52,Male,Panic Disorder,7,6,6,8,Anxiolytics,Interpersonal Therapy,2024-03-18,12,5,Deteriorated,6,Excited,63


In [None]:
### 5. Standardizing & Normalizing Data ###
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

print("\nData After Normalization (First 5 Rows):")
display(df.head())


Data After Normalization (First 5 Rows):


Unnamed: 0,Patient ID,Age,Gender,Diagnosis,Symptom Severity (1-10),Mood Score (1-10),Sleep Quality (1-10),Physical Activity (hrs/week),Medication,Therapy Type,Treatment Start Date,Treatment Duration (weeks),Stress Level (1-10),Outcome,Treatment Progress (1-10),AI-Detected Emotional State,Adherence to Treatment (%)
0,0.0,0.595238,Female,Major Depressive Disorder,1.0,0.4,0.8,0.444444,Mood Stabilizers,Interpersonal Therapy,2024-01-25,0.375,0.8,Deteriorated,0.4,Anxious,0.2
1,0.002004,0.52381,Female,Major Depressive Disorder,0.8,0.4,0.0,0.666667,Antipsychotics,Interpersonal Therapy,2024-02-27,0.375,0.4,No Change,0.4,Neutral,0.6
2,0.004008,0.880952,Female,Major Depressive Disorder,0.2,0.0,0.0,0.222222,SSRIs,Mindfulness-Based Therapy,2024-03-20,0.75,0.4,Deteriorated,0.0,Happy,0.066667
3,0.006012,0.380952,Female,Major Depressive Disorder,0.2,0.0,0.4,0.444444,SSRIs,Mindfulness-Based Therapy,2024-03-29,0.0,0.6,Deteriorated,1.0,Excited,0.4
4,0.008016,0.809524,Male,Panic Disorder,0.4,0.6,0.4,0.777778,Anxiolytics,Interpersonal Therapy,2024-03-18,0.5,0.0,Deteriorated,0.2,Excited,0.1


In [None]:
### 6. Fixing Text Inconsistencies ###
df['Gender'] = df['Gender'].str.strip().str.lower().replace({'m': 'male', 'f': 'female'})

print("\nGender Column After Standardization:")
print(df['Gender'].value_counts())
display(df.head())



Gender Column After Standardization:
Gender
male      266
female    234
Name: count, dtype: int64


Unnamed: 0,Patient ID,Age,Gender,Diagnosis,Symptom Severity (1-10),Mood Score (1-10),Sleep Quality (1-10),Physical Activity (hrs/week),Medication,Therapy Type,Treatment Start Date,Treatment Duration (weeks),Stress Level (1-10),Outcome,Treatment Progress (1-10),AI-Detected Emotional State,Adherence to Treatment (%)
0,0.0,0.595238,female,Major Depressive Disorder,1.0,0.4,0.8,0.444444,Mood Stabilizers,Interpersonal Therapy,2024-01-25,0.375,0.8,Deteriorated,0.4,Anxious,0.2
1,0.002004,0.52381,female,Major Depressive Disorder,0.8,0.4,0.0,0.666667,Antipsychotics,Interpersonal Therapy,2024-02-27,0.375,0.4,No Change,0.4,Neutral,0.6
2,0.004008,0.880952,female,Major Depressive Disorder,0.2,0.0,0.0,0.222222,SSRIs,Mindfulness-Based Therapy,2024-03-20,0.75,0.4,Deteriorated,0.0,Happy,0.066667
3,0.006012,0.380952,female,Major Depressive Disorder,0.2,0.0,0.4,0.444444,SSRIs,Mindfulness-Based Therapy,2024-03-29,0.0,0.6,Deteriorated,1.0,Excited,0.4
4,0.008016,0.809524,male,Panic Disorder,0.4,0.6,0.4,0.777778,Anxiolytics,Interpersonal Therapy,2024-03-18,0.5,0.0,Deteriorated,0.2,Excited,0.1


In [None]:
### 7. Encoding Categorical Variables ###
# Identify categorical columns (excluding numerical ones)
cat_cols = df.select_dtypes(include=['object', 'category']).columns

# One-hot encode categorical columns
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

print("\nData After Encoding Categorical Variables (First 5 Rows):")
display(df.head())


Data After Encoding Categorical Variables (First 5 Rows):


Unnamed: 0,Patient ID,Age,Symptom Severity (1-10),Mood Score (1-10),Sleep Quality (1-10),Physical Activity (hrs/week),Treatment Start Date,Treatment Duration (weeks),Stress Level (1-10),Treatment Progress (1-10),...,Therapy Type_Dialectical Behavioral Therapy,Therapy Type_Interpersonal Therapy,Therapy Type_Mindfulness-Based Therapy,Outcome_Improved,Outcome_No Change,AI-Detected Emotional State_Depressed,AI-Detected Emotional State_Excited,AI-Detected Emotional State_Happy,AI-Detected Emotional State_Neutral,AI-Detected Emotional State_Stressed
0,0.0,0.595238,1.0,0.4,0.8,0.444444,2024-01-25,0.375,0.8,0.4,...,False,True,False,False,False,False,False,False,False,False
1,0.002004,0.52381,0.8,0.4,0.0,0.666667,2024-02-27,0.375,0.4,0.4,...,False,True,False,False,True,False,False,False,True,False
2,0.004008,0.880952,0.2,0.0,0.0,0.222222,2024-03-20,0.75,0.4,0.0,...,False,False,True,False,False,False,False,True,False,False
3,0.006012,0.380952,0.2,0.0,0.4,0.444444,2024-03-29,0.0,0.6,1.0,...,False,False,True,False,False,False,True,False,False,False
4,0.008016,0.809524,0.4,0.6,0.4,0.777778,2024-03-18,0.5,0.0,0.2,...,False,True,False,False,False,False,True,False,False,False


In [None]:
### 8. Removing Unnecessary Columns ###
if 'Patient ID' in df.columns:
    df.drop(columns=['Patient ID'], inplace=True)

print("\nData After Removing Unnecessary Columns (First 5 Rows):")
display(df.head())



Data After Removing Unnecessary Columns (First 5 Rows):


Unnamed: 0,Age,Symptom Severity (1-10),Mood Score (1-10),Sleep Quality (1-10),Physical Activity (hrs/week),Treatment Start Date,Treatment Duration (weeks),Stress Level (1-10),Treatment Progress (1-10),Adherence to Treatment (%),...,Therapy Type_Dialectical Behavioral Therapy,Therapy Type_Interpersonal Therapy,Therapy Type_Mindfulness-Based Therapy,Outcome_Improved,Outcome_No Change,AI-Detected Emotional State_Depressed,AI-Detected Emotional State_Excited,AI-Detected Emotional State_Happy,AI-Detected Emotional State_Neutral,AI-Detected Emotional State_Stressed
0,0.595238,1.0,0.4,0.8,0.444444,2024-01-25,0.375,0.8,0.4,0.2,...,False,True,False,False,False,False,False,False,False,False
1,0.52381,0.8,0.4,0.0,0.666667,2024-02-27,0.375,0.4,0.4,0.6,...,False,True,False,False,True,False,False,False,True,False
2,0.880952,0.2,0.0,0.0,0.222222,2024-03-20,0.75,0.4,0.0,0.066667,...,False,False,True,False,False,False,False,True,False,False
3,0.380952,0.2,0.0,0.4,0.444444,2024-03-29,0.0,0.6,1.0,0.4,...,False,False,True,False,False,False,True,False,False,False
4,0.809524,0.4,0.6,0.4,0.777778,2024-03-18,0.5,0.0,0.2,0.1,...,False,True,False,False,False,False,True,False,False,False


In [None]:
### Save the Cleaned Data ###
cleaned_file_path = "cleaned_MH.csv"
df.to_csv(cleaned_file_path, index=False)

# Provide download link
files.download(cleaned_file_path)

print("\nData Cleaning Complete! Download 'cleaned_MH.csv' from the link above.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Data Cleaning Complete! Download 'cleaned_MH.csv' from the link above.


## **# FINDING TARGET COLUMN**

In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv("cleaned_MH.csv")

# Display column names
print("Available Columns in Dataset:")
print(df.columns)


Available Columns in Dataset:
Index(['Age', 'Symptom Severity (1-10)', 'Mood Score (1-10)',
       'Sleep Quality (1-10)', 'Physical Activity (hrs/week)',
       'Treatment Start Date', 'Treatment Duration (weeks)',
       'Stress Level (1-10)', 'Treatment Progress (1-10)',
       'Adherence to Treatment (%)', 'Gender_male',
       'Diagnosis_Generalized Anxiety', 'Diagnosis_Major Depressive Disorder',
       'Diagnosis_Panic Disorder', 'Medication_Antipsychotics',
       'Medication_Anxiolytics', 'Medication_Benzodiazepines',
       'Medication_Mood Stabilizers', 'Medication_SSRIs',
       'Therapy Type_Dialectical Behavioral Therapy',
       'Therapy Type_Interpersonal Therapy',
       'Therapy Type_Mindfulness-Based Therapy', 'Outcome_Improved',
       'Outcome_No Change', 'AI-Detected Emotional State_Depressed',
       'AI-Detected Emotional State_Excited',
       'AI-Detected Emotional State_Happy',
       'AI-Detected Emotional State_Neutral',
       'AI-Detected Emotional State_

In [None]:
for col in df.columns:
    print(f"\n🔹 Unique values in '{col}':")
    print(df[col].unique()[:10])  # Display first 10 unique values



🔹 Unique values in 'Age':
[0.5952381  0.52380952 0.88095238 0.38095238 0.80952381 0.23809524
 0.97619048 0.33333333 0.07142857 0.57142857]

🔹 Unique values in 'Symptom Severity (1-10)':
[1.  0.8 0.2 0.4 0.6 0. ]

🔹 Unique values in 'Mood Score (1-10)':
[0.4 0.  0.6 0.8 1.  0.2]

🔹 Unique values in 'Sleep Quality (1-10)':
[0.8 0.  0.4 0.2 0.6 1. ]

🔹 Unique values in 'Physical Activity (hrs/week)':
[0.44444444 0.66666667 0.22222222 0.77777778 0.33333333 0.11111111
 1.         0.         0.55555556 0.88888889]

🔹 Unique values in 'Treatment Start Date':
['2024-01-25' '2024-02-27' '2024-03-20' '2024-03-29' '2024-03-18'
 '2024-01-11' '2024-02-21' '2024-02-03' '2024-01-16' '2024-01-03']

🔹 Unique values in 'Treatment Duration (weeks)':
[0.375 0.75  0.    0.5   0.625 0.875 0.25  0.125 1.   ]

🔹 Unique values in 'Stress Level (1-10)':
[0.8 0.4 0.6 0.  1.  0.2]

🔹 Unique values in 'Treatment Progress (1-10)':
[0.4 0.  1.  0.2 0.6 0.8]

🔹 Unique values in 'Adherence to Treatment (%)':
[0.2    

In [None]:
for col in df.select_dtypes(include=['object', 'category']).columns:
    print(f"\n🔹 Value counts for '{col}':")
    print(df[col].value_counts())



🔹 Value counts for 'Treatment Start Date':
Treatment Start Date
2024-03-25    13
2024-01-16    13
2024-03-23    12
2024-01-24    10
2024-01-26     9
              ..
2024-02-11     2
2024-02-10     2
2024-04-05     1
2024-04-07     1
2024-01-29     1
Name: count, Length: 101, dtype: int64


# **# DATA BALANCING USING SMOTE TECHNIQUE**





In [None]:
!pip install imbalanced-learn




In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv("cleaned_MH.csv")

# Define target variable
target_col = 'AI-Detected Emotional State_Stressed'

# Convert 'Treatment Start Date' to datetime if it exists
if 'Treatment Start Date' in df.columns:
    df['Treatment Start Date'] = pd.to_datetime(df['Treatment Start Date'])
    df['Treatment Start Date_day'] = df['Treatment Start Date'].dt.day
    df['Treatment Start Date_month'] = df['Treatment Start Date'].dt.month
    df['Treatment Start Date_year'] = df['Treatment Start Date'].dt.year
    # Drop the original date column
    df.drop(columns=['Treatment Start Date'], inplace=True)

# Splitting features and target (after date conversion)
X = df.drop(columns=[target_col])
y = df[target_col]

# Apply SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine into a new balanced DataFrame
df_balanced = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled, columns=[target_col])], axis=1)

print("Class Distribution After SMOTE:")
print(df_balanced[target_col].value_counts())

Class Distribution After SMOTE:
AI-Detected Emotional State_Stressed
False    421
True     421
Name: count, dtype: int64


#  **# TRAINING MODEL**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the preprocessed dataset
df = pd.read_csv("cleaned_MH.csv")
print("Dataset Loaded Successfully. First 5 rows:")
print(df.head())



Dataset Loaded Successfully. First 5 rows:
        Age  Symptom Severity (1-10)  Mood Score (1-10)  Sleep Quality (1-10)  \
0  0.595238                      1.0                0.4                   0.8   
1  0.523810                      0.8                0.4                   0.0   
2  0.880952                      0.2                0.0                   0.0   
3  0.380952                      0.2                0.0                   0.4   
4  0.809524                      0.4                0.6                   0.4   

   Physical Activity (hrs/week) Treatment Start Date  \
0                      0.444444           2024-01-25   
1                      0.666667           2024-02-27   
2                      0.222222           2024-03-20   
3                      0.444444           2024-03-29   
4                      0.777778           2024-03-18   

   Treatment Duration (weeks)  Stress Level (1-10)  Treatment Progress (1-10)  \
0                       0.375                  0.8  

In [None]:
from sklearn.model_selection import train_test_split

# Split data into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display the shape of training and testing sets
print("Training Features Shape:", X_train.shape)
print("Testing Features Shape:", X_test.shape)
print("Training Labels Shape:", y_train.shape)
print("Testing Labels Shape:", y_test.shape)


Training Features Shape: (400, 28)
Testing Features Shape: (100, 28)
Training Labels Shape: (400,)
Testing Labels Shape: (100,)


In [None]:
print(X_train.dtypes)


Age                                            float64
Symptom Severity (1-10)                        float64
Mood Score (1-10)                              float64
Sleep Quality (1-10)                           float64
Physical Activity (hrs/week)                   float64
Treatment Start Date                            object
Treatment Duration (weeks)                     float64
Stress Level (1-10)                            float64
Treatment Progress (1-10)                      float64
Adherence to Treatment (%)                     float64
Gender_male                                       bool
Diagnosis_Generalized Anxiety                     bool
Diagnosis_Major Depressive Disorder               bool
Diagnosis_Panic Disorder                          bool
Medication_Antipsychotics                         bool
Medication_Anxiolytics                            bool
Medication_Benzodiazepines                        bool
Medication_Mood Stabilizers                       bool
Medication

In [None]:
# Drop 'Treatment Start Date' if it's still present
if 'Treatment Start Date' in X.columns:
    X = X.drop(columns=['Treatment Start Date'])

# Convert any object (string) columns to numeric (if necessary)
X = X.apply(pd.to_numeric, errors='coerce')  # Converts invalid entries to NaN
X = X.fillna(0)  # Replace NaN with 0 (or use another strategy)

# Re-split the cleaned data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

print("Model training completed!")


Model training completed!


In [None]:
# Step 1: Make Predictions
y_pred = model.predict(X_test)
print("Sample Predictions:", y_pred[:10])  # Print first 10 predictions

# Step 2: Calculate Accuracy in Percentage
accuracy = accuracy_score(y_test, y_pred) * 100  # Convert to percentage
print(f"Model Accuracy: {accuracy:.2f}%")  # Print accuracy with 2 decimal places

Sample Predictions: [False False False False False False False False False False]
Model Accuracy: 83.00%


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import scipy.stats as stats
import numpy as np

# Step 1: Make Predictions
y_pred = model.predict(X_test)
print("Sample Predictions:", y_pred[:10])  # Print first 10 predictions

# Step 2: Calculate Accuracy in Percentage
accuracy = accuracy_score(y_test, y_pred) * 100  # Convert to percentage
print(f"Model Accuracy: {accuracy:.2f}%")

# Step 3: Calculate Precision
precision = precision_score(y_test, y_pred, zero_division=1)  # Avoid division errors
print(f"Precision: {precision:.2f}")

# Step 4: Calculate Recall
recall = recall_score(y_test, y_pred, zero_division=1)
print(f"Recall: {recall:.2f}")

# Step 5: Calculate F1 Score
f1 = f1_score(y_test, y_pred, zero_division=1)
print(f"F1 Score: {f1:.2f}")

# Step 6: Compute Z-Score for Predictions
z_score = (np.mean(y_pred) - np.mean(y_test)) / np.std(y_test) if np.std(y_test) != 0 else 0
print(f"Z-Score: {z_score:.2f}")

# Step 7: Generate Classification Report
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Step 8: Compute Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\n Confusion Matrix:\n", conf_matrix)


Sample Predictions: [False False False False False False False False False False]
Model Accuracy: 83.00%
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
Z-Score: -0.41

Classification Report:
               precision    recall  f1-score   support

       False       0.84      0.99      0.91        84
        True       0.00      0.00      0.00        16

    accuracy                           0.83       100
   macro avg       0.42      0.49      0.45       100
weighted avg       0.70      0.83      0.76       100


 Confusion Matrix:
 [[83  1]
 [16  0]]


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

# Step 1: Apply SMOTE to Balance the Training Data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(f" Data Balanced Using SMOTE: {np.bincount(y_train_balanced)}")

# Step 2: Train the Model on Balanced Data
model_smote = RandomForestClassifier(n_estimators=100, random_state=42)
model_smote.fit(X_train_balanced, y_train_balanced)
print(" Model Training Completed with SMOTE Data")

# Step 3: Get Predictions on Test Data
y_pred_smote = model_smote.predict(X_test)

# Step 4: Compute Metrics (in Percentage)
accuracy = accuracy_score(y_test, y_pred_smote) * 100
precision = precision_score(y_test, y_pred_smote, zero_division=1) * 100
recall = recall_score(y_test, y_pred_smote, zero_division=1) * 100
f1 = f1_score(y_test, y_pred_smote, zero_division=1) * 100

# Step 5: Compute Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred_smote)

# Step 6: Compute Z-Score
z_score = (np.mean(y_pred_smote) - np.mean(y_test)) / np.std(y_test) if np.std(y_test) != 0 else 0

# Step 7: Print Results
print(f" Adjusted Accuracy (SMOTE): {accuracy:.2f}%")
print(f" Adjusted Precision (SMOTE): {precision:.2f}%")
print(f" Adjusted Recall (SMOTE): {recall:.2f}%")
print(f" Adjusted F1 Score (SMOTE): {f1:.2f}%")
print(f" Adjusted Z-Score (SMOTE): {z_score:.2f}")

print("\n Adjusted Confusion Matrix (SMOTE):")
print(conf_matrix)


 Data Balanced Using SMOTE: [337 337]
 Model Training Completed with SMOTE Data
 Adjusted Accuracy (SMOTE): 83.00%
 Adjusted Precision (SMOTE): 45.45%
 Adjusted Recall (SMOTE): 31.25%
 Adjusted F1 Score (SMOTE): 37.04%
 Adjusted Z-Score (SMOTE): -0.14

 Adjusted Confusion Matrix (SMOTE):
[[78  6]
 [11  5]]


In [None]:
print("Before SMOTE:", np.bincount(y_train))
print("After SMOTE:", np.bincount(y_train_balanced))


Before SMOTE: [337  63]
After SMOTE: [337 337]


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# ✅ STEP 1: Train & Evaluate Model BEFORE SMOTE
model_before = RandomForestClassifier(n_estimators=100, random_state=42)
model_before.fit(X_train, y_train)  # Train on imbalanced data
y_pred_before = model_before.predict(X_test)  # Predict on test data

# Compute Accuracy Before SMOTE
accuracy_before = accuracy_score(y_test, y_pred_before) * 100  # Convert to percentage
print(f"🚀 Accuracy Before SMOTE: {accuracy_before:.2f}%")

# ✅ STEP 2: Apply SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(f"✅ Data Balanced Using SMOTE: {np.bincount(y_train_balanced)}")

# ✅ STEP 3: Train & Evaluate Model AFTER SMOTE
model_after = RandomForestClassifier(n_estimators=100, random_state=42)
model_after.fit(X_train_balanced, y_train_balanced)  # Train on balanced data
y_pred_after = model_after.predict(X_test)  # Predict on test data

# Compute Accuracy After SMOTE
accuracy_after = accuracy_score(y_test, y_pred_after) * 100  # Convert to percentage
print(f"🚀 Accuracy After SMOTE: {accuracy_after:.2f}%")


🚀 Accuracy Before SMOTE: 83.00%
✅ Data Balanced Using SMOTE: [337 337]
🚀 Accuracy After SMOTE: 83.00%
