In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import os

# For display
pd.set_option('display.max_columns', None)

## Feature Engineering

In [31]:
# Load the pre-processed patient table and parse the signup timestamp.
# errors='coerce' turns unparseable dates into NaT instead of raising.
patients_df = pd.read_csv(r"C:\Users\GIDI\Desktop\Folders\REPOSITORY\medoptix-ai-internship\Datasets\processed_feature_engineering\patients.csv")
patients_df['signup_date'] = pd.to_datetime(patients_df['signup_date'], errors='coerce')

# Signup features (missing wherever signup_date is NaT)
patients_df['signup_month'] = patients_df['signup_date'].dt.month
patients_df['signup_dayofweek'] = patients_df['signup_date'].dt.day_name()

# BMI categories with the conventional cut-offs:
#   <18.5 Underweight, 18.5-<25 Normal, 25-<30 Overweight, >=30 Obese.
# right=False makes every bin [low, high), so a BMI of exactly 18.5 is
# 'Normal' and values such as 24.95 / 29.95 stay in the lower category.
# The open-ended top bin keeps extreme values from silently becoming NaN.
patients_df['bmi_category'] = pd.cut(
    patients_df['bmi'],
    bins=[0, 18.5, 25, 30, np.inf],
    right=False,
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
)

# Chronic condition binary: 0 only when the value is exactly 'No chronic'.
# Missing values compare as "not equal" and are therefore flagged 1, which
# matches the previous row-wise lambda.
# NOTE(review): confirm that a missing chronic_cond should count as chronic.
patients_df['has_chronic_cond'] = patients_df['chronic_cond'].ne('No chronic').astype('int64')

## Per-patient session summary (sessions_df)

In [None]:
# Load the raw session log and parse the visit date.
sessions_df = pd.read_csv(r"C:\Users\GIDI\Desktop\Folders\REPOSITORY\medoptix-ai-internship\Datasets\raw\sessions.csv")
sessions_df['date'] = pd.to_datetime(sessions_df['date'])

# Put each patient's sessions in calendar order, then record how pain_level
# changed relative to that patient's previous session (NaN for the first one).
sessions_sorted = sessions_df.sort_values(['patient_id', 'date'])
sessions_sorted = sessions_sorted.assign(
    pain_delta=sessions_sorted.groupby("patient_id")["pain_level"].diff()
)

# Output column -> (source column, aggregation), applied per patient.
summary_spec = {
    "n_sessions": ("session_id", "count"),
    "avg_session_duration": ("duration", "mean"),
    "first_week": ("week", "min"),
    "last_week": ("week", "max"),
    "mean_pain": ("pain_level", "mean"),
    "mean_pain_delta": ("pain_delta", "mean"),
    "home_adherence_mean": ("home_adherence_pc", "mean"),
    "satisfaction_mean": ("satisfaction", "mean"),
}

# One row per patient_id (which becomes the index of the result).
progress_summary = sessions_sorted.groupby("patient_id").agg(**summary_spec)

# Display the first few rows
progress_summary.head()


Unnamed: 0_level_0,n_sessions,avg_session_duration,first_week,last_week,mean_pain,mean_pain_delta,home_adherence_mean,satisfaction_mean
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,20,41.75,1,12,2.15,-0.263158,77.75,6.5
2,8,33.875,1,11,2.25,-0.571429,61.625,7.0
3,17,41.470588,1,12,2.470588,-0.3125,73.705882,5.941176
4,17,47.588235,1,12,2.0,-0.3125,69.764706,6.647059
5,12,33.833333,1,10,2.333333,-0.363636,73.666667,6.416667


## Merge sessions_df and patients_df

In [33]:
# Merge patient demographic/clinical info with the per-patient session summary.
# how='left' keeps every patient; those without any session get NaN in the
# summary columns.
full_df = pd.merge(patients_df, progress_summary, on='patient_id', how='left')

# Set 'patient_id' as index
full_df = full_df.set_index('patient_id')

# Columns of interest for downstream work. This list is only defined here;
# full_df itself is left unfiltered.
# The session-derived names match what progress_summary actually provides
# (mean_pain_delta, home_adherence_mean, satisfaction_mean, avg_session_duration).
# The earlier entries 'pain_pct_change', 'adherence_variability' and
# 'active_weeks' were removed because no step shown here creates them, so
# selecting them would raise a KeyError.
# NOTE(review): 'age', 'smoker', 'gender' and 'consent' are assumed to exist in
# patients_df - confirm against the CSV.
selected_cols = [
    'age', 'bmi', 'smoker', 'gender', 'consent',
    'mean_pain_delta', 'home_adherence_mean', 'satisfaction_mean',
    'avg_session_duration',
]

In [None]:
# Optional export of the merged table. Kept off by default; set SAVE_FULL_DF to
# True to write the CSV.
# This replaces the previous approach of wrapping the code in a ''' string:
# inside a non-raw string the "\U" in "C:\Users" is an invalid unicode escape,
# so that cell failed with a SyntaxError instead of being a harmless no-op.
SAVE_FULL_DF = False

if SAVE_FULL_DF:
    # Define path
    save_path = r"C:\Users\GIDI\Desktop\Folders\REPOSITORY\medoptix-ai-internship\Datasets\processed_patient_segmentation"
    os.makedirs(save_path, exist_ok=True)

    # Save DataFrame (index=True writes the frame's index as the first column)
    full_df.to_csv(os.path.join(save_path, "full_df.csv"), index=True)