In [1]:
import glob
import os
from io import StringIO

import boto3
import pandas as pd
import psycopg2
from dotenv import load_dotenv
from sqlalchemy.engine import URL

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Install a blanket "ignore" filter so no warning is shown for the rest of the session.
warnings.filterwarnings(action='ignore')

# Display setting: a max_columns of None removes the cap on rendered DataFrame columns.
pd.options.display.max_columns = None

## Upload CSVs to S3 (using boto3)

In [None]:
# The one-off upload is kept as a callable instead of a string-quoted block.
# It is off by default; set RUN_S3_UPLOAD = True to (re-)upload the local CSVs.
RUN_S3_UPLOAD = False


def upload_csvs_to_s3(local_folder, bucket_name='medoptix', prefix='datasets/', s3_client=None):
    """Upload every ``*.csv`` file found directly in ``local_folder`` to S3.

    Parameters
    ----------
    local_folder : str
        Directory that is searched (non-recursively) for ``*.csv`` files.
    bucket_name : str
        Destination bucket.
    prefix : str
        Key prefix; each object key is ``prefix + <file name>``.
    s3_client : object, optional
        Anything exposing ``upload_file(path, bucket, key)``. When omitted a
        ``boto3.client('s3')`` is created.

    Returns
    -------
    list of str
        Keys that were uploaded successfully, in upload order. A failed upload
        is reported on stdout and skipped so the remaining files still go up.
    """
    if s3_client is None:
        s3_client = boto3.client('s3')

    # Sorted so the upload order (and the printed list) is deterministic.
    csv_files = sorted(glob.glob(os.path.join(local_folder, '*.csv')))
    print("🔍 Found CSV files:", csv_files)

    uploaded_keys = []
    for file_path in csv_files:
        # Computed outside the try so the failure message can always name the file.
        file_name = os.path.basename(file_path)
        s3_key = f"{prefix}{file_name}"
        try:
            s3_client.upload_file(file_path, bucket_name, s3_key)
        except Exception as e:
            print(f" Failed to upload {file_name}: {e}")
        else:
            print(f" Uploaded: {file_name} to s3://{bucket_name}/{s3_key}")
            uploaded_keys.append(s3_key)
    return uploaded_keys


if RUN_S3_UPLOAD:
    # The folder comes from the environment rather than a user-specific absolute path;
    # the fallback is a "Datasets" directory relative to the working directory.
    upload_csvs_to_s3(os.getenv("MEDOPTIX_DATASETS_DIR", "Datasets"))


🔍 Found CSV files: ['C:\\Users\\GIDI\\Desktop\\Folders\\REPOSITORY\\medoptix-ai-internship\\Datasets\\clinics.csv', 'C:\\Users\\GIDI\\Desktop\\Folders\\REPOSITORY\\medoptix-ai-internship\\Datasets\\dropout_flags.csv', 'C:\\Users\\GIDI\\Desktop\\Folders\\REPOSITORY\\medoptix-ai-internship\\Datasets\\feedback.csv', 'C:\\Users\\GIDI\\Desktop\\Folders\\REPOSITORY\\medoptix-ai-internship\\Datasets\\interventions.csv', 'C:\\Users\\GIDI\\Desktop\\Folders\\REPOSITORY\\medoptix-ai-internship\\Datasets\\patients.csv', 'C:\\Users\\GIDI\\Desktop\\Folders\\REPOSITORY\\medoptix-ai-internship\\Datasets\\sessions.csv']
✅ Uploaded: clinics.csv to s3://medoptix/datasets/clinics.csv
✅ Uploaded: dropout_flags.csv to s3://medoptix/datasets/dropout_flags.csv


## Load from S3 → PostgreSQL


In [5]:
from sqlalchemy import create_engine
import boto3

# S3 client used by the (currently disabled) download loop at the end of this cell.
s3 = boto3.client("s3")


# Bucket name and key prefix that are combined with each file name below.
bucket = "medoptix"
prefix = "datasets/"

# Files to download
# NOTE(review): interventions.csv is not listed here although a later cell reads it
# from the same bucket/prefix — confirm whether the omission is intentional.
files = ["patients.csv", "clinics.csv", "sessions.csv", "feedback.csv", "dropout_flags.csv"]

# Download files
# The loop is wrapped in a triple-quoted string, so it is NOT executed: the cell only
# evaluates the string (which is why the cell output is the loop's source text).
# Remove the quotes to actually download each file into the working directory.
'''for file in files:
    s3.download_file(bucket, prefix + file, file)
    print(f"⬇️ Downloaded {file} from S3")'''

'for file in files:\n    s3.download_file(bucket, prefix + file, file)\n    print(f"⬇️ Downloaded {file} from S3")'

# Data Modelling & Defining Foreign Key relationships


- patients (PK: patient_id)
- sessions (PK: session_id, FK: patient_id)
- feedback (PK: feedback_id, FK: session_id)


TASK 1 - Model and define the relationships for the remaining datasets
 - clinics
 - dropout_flags
 - interventions

TASK 2 - Create schemas for these datasets and upload them into PostgreSQL

##  Upload Data → PostgreSQL


In [6]:
# PostgreSQL Configuration
# Load the .env file before reading it: on a fresh kernel no earlier cell has called
# load_dotenv(), so without this every lookup below returns None unless the variables
# are already exported in the process environment.
load_dotenv()

# Connection settings keyed by the keyword names used for a PostgreSQL connection.
# Values are the raw environment strings (None when a variable is not set).
db_params = {
    "host": os.getenv("DB_HOST"),
    "port": os.getenv("DB_PORT"),
    "dbname": os.getenv("DB_NAME"),
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "sslmode": os.getenv("DB_SSLMODE")
}

In [None]:
import pandas as pd
from sqlalchemy import create_engine, text
import os
from dotenv import load_dotenv

# Load env credentials for PostgreSQL
load_dotenv()

# Fail early with a readable message instead of building a URL out of "None" strings.
_required_env = ("DB_USER", "DB_PASSWORD", "DB_HOST", "DB_PORT", "DB_NAME")
_missing_env = [name for name in _required_env if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        "Missing PostgreSQL settings in the environment/.env file: " + ", ".join(_missing_env)
    )

# Build the URL from its components so characters such as '@', ':' or '/' in the
# user name or password are escaped instead of corrupting the connection string.
# DB_SSLMODE is optional; when set it is forwarded as the ?sslmode= query parameter,
# matching the "sslmode" entry of the db_params dictionary.
_sslmode = os.getenv("DB_SSLMODE")
_pg_url = URL.create(
    drivername="postgresql",
    username=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    host=os.getenv("DB_HOST"),
    port=int(os.getenv("DB_PORT")),
    database=os.getenv("DB_NAME"),
    query={"sslmode": _sslmode} if _sslmode else {},
)

# db_url stays a plain string (it contains the password — do not print or log it).
db_url = _pg_url.render_as_string(hide_password=False)
engine = create_engine(_pg_url)

# --- S3 CONFIG (pandas + s3fs handles this internally) ---
bucket = "medoptix"
prefix = "datasets/"

# Map each logical dataset name to its full s3:// URL. Note that the "dropout"
# dataset is stored under the object name dropout_flags.csv.
files = {
    dataset: f"s3://{bucket}/{prefix}{csv_name}"
    for dataset, csv_name in (
        ("patients", "patients.csv"),
        ("clinics", "clinics.csv"),
        ("sessions", "sessions.csv"),
        ("feedback", "feedback.csv"),
        ("interventions", "interventions.csv"),
        ("dropout", "dropout_flags.csv"),
    )
}

# READ DATA FROM S3 DIRECTLY INTO DATAFRAMES
def _read_s3_csv(dataset, **read_kwargs):
    """Read the ``files[dataset]`` URL with ``pd.read_csv``.

    The ``{"anon": False}`` storage option is always passed; any extra keyword
    arguments (e.g. ``parse_dates``) are forwarded to ``pd.read_csv`` unchanged.
    """
    return pd.read_csv(files[dataset], storage_options={"anon": False}, **read_kwargs)


patients = _read_s3_csv("patients")
clinics = _read_s3_csv("clinics")
# The 'date' column of sessions is parsed as datetimes while reading.
sessions = _read_s3_csv("sessions", parse_dates=['date'])
feedback = _read_s3_csv("feedback")
# The 'sent_at' column of interventions is parsed as datetimes while reading.
interventions = _read_s3_csv("interventions", parse_dates=['sent_at'])
dropout = _read_s3_csv("dropout")

# CREATE TABLES IN POSTGRESQL
# All six CREATE TABLE IF NOT EXISTS statements are sent as one text() call inside a
# single engine.begin() block. Each table is declared after the table it REFERENCES:
#   dim_clinics -> dim_patients -> fact_sessions -> fact_feedback
#   dim_patients -> fact_interventions
#   dim_patients -> fact_dropout_flags (patient_id is both primary key and foreign key)
# Because of IF NOT EXISTS, a table that already exists is left untouched, so editing a
# column definition below does not alter an existing table.
# NOTE(review): executing several statements in one execute() call presumably relies on
# the database driver accepting multi-statement strings — confirm for the driver in use.
with engine.begin() as conn:
    conn.execute(text("""
        CREATE TABLE IF NOT EXISTS dim_clinics (
            clinic_id INT PRIMARY KEY,
            city TEXT,
            country TEXT,
            type TEXT,
            postcode TEXT,
            capacity INT,
            staff_count INT,
            speciality TEXT,
            avg_rating FLOAT
        );

        CREATE TABLE IF NOT EXISTS dim_patients (
            patient_id INT PRIMARY KEY,
            age INT,
            gender TEXT,
            bmi FLOAT,
            smoker BOOLEAN,
            chronic_cond TEXT,
            injury_type TEXT,
            signup_date TIMESTAMP,
            referral_source TEXT,
            consent BOOLEAN,
            clinic_id INT REFERENCES dim_clinics(clinic_id),
            insurance_type TEXT
        );

        CREATE TABLE IF NOT EXISTS fact_sessions (
            session_id UUID PRIMARY KEY,
            patient_id INT REFERENCES dim_patients(patient_id),
            date TIMESTAMP,
            week INT,
            duration INT,
            pain_level INT,
            exercise_type TEXT,
            home_adherence_pc FLOAT,
            satisfaction INT,
            therapist_id INT
        );

        CREATE TABLE IF NOT EXISTS fact_feedback (
            feedback_id UUID PRIMARY KEY,
            session_id UUID REFERENCES fact_sessions(session_id),
            comments TEXT,
            sentiment FLOAT
        );

        CREATE TABLE IF NOT EXISTS fact_interventions (
            intervention_id UUID PRIMARY KEY,
            patient_id INT REFERENCES dim_patients(patient_id),
            sent_at TIMESTAMP,
            channel TEXT,
            message TEXT,
            responded TEXT
        );

        CREATE TABLE IF NOT EXISTS fact_dropout_flags (
            patient_id INT PRIMARY KEY REFERENCES dim_patients(patient_id),
            dropout BOOLEAN,
            dropout_week INT
        );
    """))

# --- UPLOAD DATA TO POSTGRES ---
# (table name, DataFrame) pairs in insertion order: referenced tables first so the
# REFERENCES constraints of the later tables are satisfied.
_load_plan = [
    ('dim_clinics', clinics),
    ('dim_patients', patients),
    ('fact_sessions', sessions),
    ('fact_feedback', feedback),
    ('fact_interventions', interventions),
    ('fact_dropout_flags', dropout),
]

# Run the whole load in ONE transaction. Previously each to_sql() committed on its own,
# so a failure part-way through left the earlier tables populated and every re-run then
# failed on duplicate primary keys. Now an error rolls back all six inserts.
# Rows are still appended: re-running after a successful load will raise on the
# primary-key constraints rather than silently duplicating data.
with engine.begin() as conn:
    for _table_name, _frame in _load_plan:
        _frame.to_sql(_table_name, conn, if_exists='append', index=False)

print("All datasets loaded from S3 and inserted into PostgreSQL successfully.")

All datasets loaded from S3 and inserted into PostgreSQL successfully.
