In [1]:
import boto3
import pandas as pd
import psycopg2
from io import StringIO
import glob
import os
from dotenv import load_dotenv

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the remainder of the session.
warnings.simplefilter('ignore')

# Let pandas render all columns of wide frames instead of eliding the middle ones.
pd.options.display.max_columns = None

## A. Upload CSVs to S3 (using boto3)

In [None]:
# AWS S3 Configuration
s3_client = boto3.client('s3')
bucket_name = 'medoptix'
prefix = 'datasets/'

# Folder holding the CSVs to upload. Set the MEDOPTIX_DATASETS_DIR environment
# variable to point at your own copy; the original author's path is kept as the
# fallback so existing setups keep working unchanged.
local_folder = os.getenv(
    'MEDOPTIX_DATASETS_DIR',
    r'C:\Users\GIDI\Desktop\Folders\REPOSITORY\medoptix-ai-internship\Datasets',
)

# Sorted so the upload order (and the log below) is the same on every run.
csv_files = sorted(glob.glob(os.path.join(local_folder, '*.csv')))

print("🔍 Found CSV files:", csv_files)
if not csv_files:
    # An empty match usually means the folder path is wrong; say so explicitly
    # rather than finishing silently with nothing uploaded.
    print(f"⚠️ No CSV files found in {local_folder}")

for file_path in csv_files:
    # Derived before the try block so the failure message below always names
    # the file currently being processed.
    file_name = os.path.basename(file_path)
    s3_key = f"{prefix}{file_name}"

    try:
        s3_client.upload_file(file_path, bucket_name, s3_key)
        print(f"✅ Uploaded: {file_name} to s3://{bucket_name}/{s3_key}")
    except Exception as e:
        # Best-effort upload: report the failure and continue with the next file.
        print(f"❌ Failed to upload {file_name}: {e}")


🔍 Found CSV files: ['C:\\Users\\GIDI\\Desktop\\Folders\\REPOSITORY\\medoptix-ai-internship\\Datasets\\clinics.csv', 'C:\\Users\\GIDI\\Desktop\\Folders\\REPOSITORY\\medoptix-ai-internship\\Datasets\\dropout_flags.csv', 'C:\\Users\\GIDI\\Desktop\\Folders\\REPOSITORY\\medoptix-ai-internship\\Datasets\\feedback.csv', 'C:\\Users\\GIDI\\Desktop\\Folders\\REPOSITORY\\medoptix-ai-internship\\Datasets\\interventions.csv', 'C:\\Users\\GIDI\\Desktop\\Folders\\REPOSITORY\\medoptix-ai-internship\\Datasets\\patients.csv', 'C:\\Users\\GIDI\\Desktop\\Folders\\REPOSITORY\\medoptix-ai-internship\\Datasets\\sessions.csv']
✅ Uploaded: clinics.csv to s3://medoptix/datasets/clinics.csv
✅ Uploaded: dropout_flags.csv to s3://medoptix/datasets/dropout_flags.csv


## B. Load from S3 → PostgreSQL


In [None]:
import pandas as pd
from sqlalchemy import create_engine
import boto3

# Where the staged datasets live in S3.
bucket = "medoptix"
prefix = "datasets/"

s3 = boto3.client("s3")

# Objects to fetch; each one is written to the current working directory
# under the same file name.
files = [
    "patients.csv",
    "clinics.csv",
    "sessions.csv",
    "feedback.csv",
    "dropout_flags.csv",
]

# Pull every listed object down and log it once the download call returns.
for file in files:
    s3.download_file(bucket, f"{prefix}{file}", file)
    print(f"⬇️ Downloaded {file} from S3")

⬇️ Downloaded patients.csv from S3
⬇️ Downloaded clinics.csv from S3
⬇️ Downloaded sessions.csv from S3
⬇️ Downloaded feedback.csv from S3
⬇️ Downloaded dropout_flags.csv from S3


# Data Modelling & Defining Foreign Key relationships


- patients (PK: patient_id)
- sessions (PK: session_id, FK: patient_id)
- feedback (PK: feedback_id, FK: session_id)


TASK 1 - Model and define the relationships for the remaining datasets
 - clinics
 - dropout_flags
 - interventions (interventions.csv)

TASK 2 - Create schemas for these datasets and upload them into PostgreSQL

## C. Upload Data → PostgreSQL


In [4]:
# PostgreSQL Configuration
# Read the project's .env file (if one exists) into the process environment
# first; without this call the os.getenv lookups below only see variables that
# were already exported in the shell and otherwise return None.
load_dotenv()

# Connection settings keyed by the keyword names used for a PostgreSQL
# connection. Any variable that is not set is left as None.
db_params = {
    "host": os.getenv("DB_HOST"),
    "port": os.getenv("DB_PORT"),
    "dbname": os.getenv("DB_NAME"),
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "sslmode": os.getenv("DB_SSLMODE")
}

In [2]:
# NOTE: everything below is one triple-quoted string literal, so none of the code
# inside it runs; executing this cell only echoes the string back as its output.
# The text is a draft loader that reads the patients, sessions and feedback CSVs
# and appends them, in that order, to same-named tables inside a single
# `engine.begin()` block.
# NOTE(review): the connection-URL placeholders in its `get_db_engine` are empty
# strings and one CSV path is an absolute path on a specific machine; both would
# need to be filled in before this draft could be re-enabled.
'''import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv

# Load environment variables (recommended for security)
load_dotenv()

def get_db_engine():
    """Create and return a SQLAlchemy engine with proper connection string"""
    # Construct connection string from environment variables
    db_url = (
        f"postgresql://{''}:{''}@"
        f"{''}:{''}/{''}?"
    )
    return create_engine(db_url)

def upload_data():
    # Step 1: Read CSV files
    patients = pd.read_csv("medoptix_data\processed\patients.csv")
    sessions = pd.read_csv("medoptix_data\processed\sessions.csv")
    feedback = pd.read_csv("C:/Users/Muham/Downloads/Medoptix_Demo/medoptix_data/processed/feedback.csv")

    # Step 2: Create database engine
    engine = get_db_engine()

    # Step 3: Upload in referential order with error handling
    with engine.begin() as connection:  # Automatically handles transactions
        # Chunk size for large datasets (adjust as needed)
        chunk_size = 1000

        # Upload patients table
        patients.to_sql(
            "patients", 
            connection, 
            if_exists="append", 
            index=False,
            chunksize=chunk_size,
            method='multi'  # Faster for bulk inserts
        )

        # Upload sessions table
        sessions.to_sql(
            "sessions", 
            connection, 
            if_exists="append", 
            index=False,
            chunksize=chunk_size,
            method='multi'
        )

        # Upload feedback table
        feedback.to_sql(
            "feedback", 
            connection, 
            if_exists="append", 
            index=False,
            chunksize=chunk_size,
            method='multi'
        )

    print("✅ Data uploaded successfully with relationships intact.")

if __name__ == "__main__":
    upload_data() '''

  '''import pandas as pd


'import pandas as pd\nfrom sqlalchemy import create_engine\nimport os\nfrom dotenv import load_dotenv\n\n# Load environment variables (recommended for security)\nload_dotenv()\n\ndef get_db_engine():\n    """Create and return a SQLAlchemy engine with proper connection string"""\n    # Construct connection string from environment variables\n    db_url = (\n        f"postgresql://{\'\'}:{\'\'}@"\n        f"{\'\'}:{\'\'}/{\'\'}?"\n    )\n    return create_engine(db_url)\n\ndef upload_data():\n    # Step 1: Read CSV files\n    patients = pd.read_csv("medoptix_data\\processed\\patients.csv")\n    sessions = pd.read_csv("medoptix_data\\processed\\sessions.csv")\n    feedback = pd.read_csv("C:/Users/Muham/Downloads/Medoptix_Demo/medoptix_data/processed/feedback.csv")\n\n    # Step 2: Create database engine\n    engine = get_db_engine()\n\n    # Step 3: Upload in referential order with error handling\n    with engine.begin() as connection:  # Automatically handles transactions\n        # Chunk

## D. Read Data from PostgreSQL (Prepare data for EDA)


In [None]:
from sqlalchemy import create_engine
import pandas as pd


def get_db_engine():
    """Create and return a SQLAlchemy engine for the project's PostgreSQL database.

    Connection settings are read from the same environment variables used for
    ``db_params``: ``DB_HOST``, ``DB_PORT``, ``DB_NAME``, ``DB_USER``,
    ``DB_PASSWORD`` and ``DB_SSLMODE``. A local ``.env`` file is loaded first so
    the variables do not have to be exported in the shell.

    Returns:
        The engine produced by ``create_engine`` for the assembled URL.

    Raises:
        RuntimeError: if ``DB_HOST``, ``DB_NAME`` or ``DB_USER`` is unset or
            empty, instead of silently building an unusable URL.
    """
    import os
    from urllib.parse import quote

    # Populate os.environ from .env (if present) before reading any setting.
    load_dotenv()

    required = ("DB_HOST", "DB_NAME", "DB_USER")
    missing = [name for name in required if not os.getenv(name)]
    if missing:
        raise RuntimeError(
            "Missing database settings: " + ", ".join(missing)
            + ". Define them in the environment or in a .env file."
        )

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # cannot be mistaken for URL delimiters.
    credentials = quote(os.environ["DB_USER"], safe="")
    password = os.getenv("DB_PASSWORD")
    if password:
        credentials += ":" + quote(password, safe="")

    # Port and sslmode are optional; leave them out of the URL when unset.
    location = os.environ["DB_HOST"]
    port = os.getenv("DB_PORT")
    if port:
        location += f":{port}"

    db_url = f"postgresql://{credentials}@{location}/{os.environ['DB_NAME']}"
    sslmode = os.getenv("DB_SSLMODE")
    if sslmode:
        db_url += "?sslmode=" + quote(sslmode, safe="")

    return create_engine(db_url)


# One shared engine serves every read in this section.
engine = get_db_engine()

# Each table is pulled in full into its own DataFrame. The SQL text stays in a
# named variable next to the read that uses it.
patients_query = "SELECT * FROM patients"
patients_df = pd.read_sql(patients_query, engine)

sessions_query = "SELECT * FROM sessions"
sessions_df = pd.read_sql(sessions_query, engine)

feedback_query = "SELECT * FROM feedback"
feedback_df = pd.read_sql(feedback_query, engine)


