In [1]:
import os
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

## Feature Engineering

In [8]:
# Load the patient table and derive signup, BMI and chronic-condition features.
patients_df = pd.read_csv(r"C:\Users\GIDI\Desktop\Folders\REPOSITORY\medoptix-ai-internship\Datasets\processed_feature_engineering\patients.csv")

# Unparseable signup dates become NaT instead of raising.
patients_df['signup_date'] = pd.to_datetime(patients_df['signup_date'], errors='coerce')

# Signup features
patients_df['signup_month'] = patients_df['signup_date'].dt.month
patients_df['signup_dayofweek'] = patients_df['signup_date'].dt.day_name()

# BMI categories on half-open intervals [low, high):
#   Underweight < 18.5 <= Normal < 25 <= Overweight < 30 <= Obese
# right=False keeps a BMI of exactly 18.5 in 'Normal', and using 25 / 30 as
# edges (instead of 24.9 / 29.9) stops values such as 24.95 or 29.95 from
# spilling into the next category. The open upper edge keeps BMI > 100 in
# 'Obese' rather than turning it into NaN.
patients_df['bmi_category'] = pd.cut(
    patients_df['bmi'],
    bins=[0, 18.5, 25, 30, np.inf],
    right=False,
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
)

# Chronic condition binary: 0 only for the literal label 'No chronic', else 1.
# NOTE(review): a missing chronic_cond also yields 1 (same as before) — confirm
# that is the intended treatment of nulls.
patients_df['has_chronic_cond'] = patients_df['chronic_cond'].ne('No chronic').astype(int)

## Session-level features (sessions_df)

In [9]:
# Load the raw session log, parse `date`, and order each patient's sessions
# chronologically so that 'first' / 'last' below refer to the earliest and
# latest recorded values.
sessions_df = (
    pd.read_csv(r"C:\Users\GIDI\Desktop\Folders\REPOSITORY\medoptix-ai-internship\Datasets\raw\sessions.csv")
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(['patient_id', 'date'])
)

# One row per patient; named aggregation yields the final column names directly.
progress_summary = (
    sessions_df
    .groupby('patient_id')
    .agg(
        pain_level_first=('pain_level', 'first'),
        pain_level_last=('pain_level', 'last'),
        pain_level_mean=('pain_level', 'mean'),
        avg_satisfaction=('satisfaction', 'mean'),
        avg_home_adherence=('home_adherence_pc', 'mean'),
        adherence_variability=('home_adherence_pc', 'std'),
        avg_session_duration=('duration', 'mean'),
        active_weeks=('week', 'nunique'),  # engagement span
    )
    .reset_index()
)

# Derived features: positive pain_delta means pain went down between the
# first and last recorded value.
first_pain = progress_summary['pain_level_first']
progress_summary['pain_delta'] = first_pain - progress_summary['pain_level_last']

# A starting pain of 0 is swapped for NaN to avoid dividing by zero; any
# resulting NaN is then reported as 0.
progress_summary['pain_pct_change'] = (
    progress_summary['pain_delta'].div(first_pain.replace(0, np.nan)).fillna(0)
)

progress_summary.head()

Unnamed: 0,patient_id,pain_level_first,pain_level_last,pain_level_mean,avg_satisfaction,avg_home_adherence,adherence_variability,avg_session_duration,active_weeks,pain_delta,pain_pct_change
0,1,6,1,2.15,6.5,77.75,19.983875,41.75,12,5,0.833333
1,2,5,1,2.25,7.0,61.625,12.682243,33.875,5,4,0.8
2,3,6,1,2.470588,5.941176,73.705882,13.873377,41.470588,11,5,0.833333
3,4,6,1,2.0,6.647059,69.764706,15.176171,47.588235,12,5,0.833333
4,5,5,1,2.333333,6.416667,73.666667,14.74223,33.833333,8,4,0.8


## Merge the session summary (progress_summary) into patients_df

In [10]:
# Left-join the per-patient session summary onto the patient table, so every
# patient row is kept; patients without a summary row get NaN in those columns.
full_df = patients_df.merge(progress_summary, how='left', on='patient_id')

# Columns of interest for later steps.
# NOTE: this list is only defined here — full_df is not subset in this cell.
selected_cols = [
    'age',
    'bmi',
    'smoker',
    'gender',
    'consent',
    'pain_delta',
    'pain_pct_change',
    'avg_home_adherence',
    'adherence_variability',
    'avg_satisfaction',
    'avg_session_duration',
    'active_weeks',
]

In [None]:
# Optional export of the merged table.
# Disabled by default so a full re-run never writes to disk unintentionally;
# set SAVE_FULL_DF = True to write full_df.csv into the folder below.
SAVE_FULL_DF = False

if SAVE_FULL_DF:
    # Define path
    save_path = r"C:\Users\GIDI\Desktop\Folders\REPOSITORY\medoptix-ai-internship\Datasets\processed_patient_segmentation"
    os.makedirs(save_path, exist_ok=True)

    # Save DataFrame
    full_df.to_csv(os.path.join(save_path, "full_df.csv"), index=False)
