In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# --- Configuration ---------------------------------------------------------
# Seed both random sources used in this notebook (the stdlib `random` module
# and NumPy's global generator) so that Restart & Run All regenerates exactly
# the same synthetic dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

n_samples = 750  # number of synthetic patient records to generate

# Category pools sampled when building each record.
genders = ['Male', 'Female']
areas = ['Rural', 'Sub-urban', 'Urban']
eyes = ['Left', 'Right', 'Both']
locations = ['Nasal', 'Temporal', 'Bi-headed']
# These labels are written verbatim into the generated data, so their spelling
# is kept exactly as-is.
complications = ['Large scar', 'Haematoma', 'Tennon’s prolapse', 'Retracted Graft', 'Edema', 'Normal']
refraction_axes = [45, 90, 180]


def random_duration(max_years=5):
    """Return a random whole number of months.

    The value is drawn with ``random.randint`` from 1 up to and including
    ``max_years * 12``.
    """
    return random.randint(1, 12 * max_years)

def random_date(start_days_ago=365, max_days_offset=30):
    """Return a random calendar date from the recent past.

    A whole number of days between 0 and ``start_days_ago`` (inclusive) is
    subtracted from the current local time and the date part is returned.

    ``max_days_offset`` is accepted but is not used by this function.
    """
    days_back = random.randint(0, start_days_ago)
    return (datetime.now() - timedelta(days=days_back)).date()

def refraction_values():
    """Draw one synthetic refraction reading.

    Returns a 3-tuple: two independent uniform draws between -2 and 2, each
    rounded to two decimals, followed by one entry picked from
    ``refraction_axes``. Callers unpack it as (sph, cyl, axis).
    """
    sphere = round(np.random.uniform(-2, 2), 2)
    cylinder = round(np.random.uniform(-2, 2), 2)
    axis = random.choice(refraction_axes)
    return sphere, cylinder, axis


def _measure(low, high):
    """Draw one uniform value between `low` and `high`, rounded to two decimals."""
    return round(np.random.uniform(low, high), 2)


def _flag_with_duration():
    """Draw a 0/1 flag plus a duration in months (0 when the flag is not set)."""
    flag = random.choice([0, 1])
    return [flag, random_duration() if flag else 0]


def _followup_exam(exam_date, complication_weights):
    """Build the ten values of one follow-up visit, in column order.

    Order: date, vision L/R, sph/cyl left, sph/cyl right, keratometry L/R,
    complication. The refraction axis is drawn but not recorded.
    """
    exam = [exam_date, _measure(0.1, 1.0), _measure(0.1, 1.0)]
    for _side in ('left', 'right'):
        sph, cyl, _axis = refraction_values()
        exam += [sph, cyl]
    exam += [_measure(40, 48), _measure(40, 48)]
    exam.append(random.choices(complications, weights=complication_weights)[0])
    return exam


# Build one positional record per patient. Values are generated in exactly the
# order in which they appear in the row, so each section simply extends it.
data = []
for i in range(n_samples):
    # Demographics: id, age, gender, area
    record = [
        i + 1,
        random.randint(20, 80),
        random.choice(genders),
        random.choices(areas, weights=[0.6, 0.2, 0.2])[0],  # Rural dominant
    ]

    # Symptoms: inflamed eye, diminution of vision, cosmetic (0/1 each),
    # then symptom duration in months and the affected eye
    record += [random.choice([0, 1]) for _ in range(3)]
    record += [random_duration(), random.choice(eyes)]

    # Past ocular history; recurrence is only possible after a past surgery
    past_surgery, past_surgery_duration = _flag_with_duration()
    recurrent = 0
    if past_surgery and random.random() < 0.5:
        recurrent = 1
    record += [past_surgery, past_surgery_duration, recurrent]

    # Medical history
    record += _flag_with_duration()  # diabetes flag + duration
    record += _flag_with_duration()  # hypertension flag + duration

    # Pre-op data
    record += [_measure(0.1, 1.0), _measure(0.1, 1.0)]  # vision left, right
    record += refraction_values()  # left eye: sph, cyl, axis
    record += refraction_values()  # right eye: sph, cyl, axis
    record += [random.randint(1, 4), random.choice(locations)]  # grade, location
    record += [_measure(40, 48), _measure(40, 48)]  # keratometry left, right

    # Surgery info
    surgery_date = random_date()
    record += ["CAG", surgery_date]

    # Follow-up 1 (7 days after surgery) and follow-up 2 (30 days after surgery)
    record += _followup_exam(surgery_date + timedelta(days=7),
                             [0.04, 0.04, 0.03, 0.03, 0.06, 0.5])
    record += _followup_exam(surgery_date + timedelta(days=30),
                             [0.04, 0.04, 0.03, 0.03, 0.06, 0.75])

    data.append(record)

# Column names, listed in the same positional order as the values in each
# row of `data`.
columns = (
    # Demographics
    ['patient_id', 'age', 'gender', 'area']
    # Symptoms
    + ['symptom_inflamed_eye', 'symptom_diminution_vision', 'symptom_cosmetic',
       'symptom_duration_months', 'affected_eye']
    # Past ocular history
    + ['past_surgery', 'past_surgery_duration_months', 'recurrent']
    # Medical history
    + ['has_diabetes', 'diabetes_duration_months',
       'has_hypertension', 'hypertension_duration_months']
    # Pre-op measurements
    + ['preop_vision_left', 'preop_vision_right',
       'preop_refraction_sph_left', 'preop_refraction_cyl_left', 'preop_refraction_axis_left',
       'preop_refraction_sph_right', 'preop_refraction_cyl_right', 'preop_refraction_axis_right',
       'pterygium_grade', 'pterygium_location',
       'preop_keratometry_left', 'preop_keratometry_right']
    # Surgery
    + ['surgery_type', 'surgery_date']
    # Follow-up 1
    + ['followup1_date', 'followup1_vision_left', 'followup1_vision_right',
       'followup1_refraction_sph_left', 'followup1_refraction_cyl_left',
       'followup1_refraction_sph_right', 'followup1_refraction_cyl_right',
       'followup1_keratometry_left', 'followup1_keratometry_right',
       'followup1_complication']
    # Follow-up 2
    + ['followup2_date', 'followup2_vision_left', 'followup2_vision_right',
       'followup2_refraction_sph_left', 'followup2_refraction_cyl_left',
       'followup2_refraction_sph_right', 'followup2_refraction_cyl_right',
       'followup2_keratometry_left', 'followup2_keratometry_right',
       'followup2_complication']
)

# Assemble the generated rows into a DataFrame labelled with `columns`, then
# show the first five records as the cell output.
df = pd.DataFrame(data=data, columns=columns)
df.head()


Unnamed: 0,patient_id,age,gender,area,symptom_inflamed_eye,symptom_diminution_vision,symptom_cosmetic,symptom_duration_months,affected_eye,past_surgery,...,followup2_date,followup2_vision_left,followup2_vision_right,followup2_refraction_sph_left,followup2_refraction_cyl_left,followup2_refraction_sph_right,followup2_refraction_cyl_right,followup2_keratometry_left,followup2_keratometry_right,followup2_complication
0,1,53,Male,Rural,1,0,0,17,Both,0,...,2024-08-13,0.59,0.96,0.36,-1.77,0.73,1.37,42.64,44.05,Normal
1,2,49,Female,Sub-urban,1,1,0,15,Both,1,...,2024-08-07,0.72,0.31,0.44,-0.0,-0.96,1.24,46.31,41.39,Edema
2,3,32,Male,Rural,0,1,1,28,Right,0,...,2024-07-01,0.28,0.48,1.78,-0.92,0.54,1.12,41.0,41.03,Normal
3,4,25,Female,Rural,0,1,0,31,Left,0,...,2024-09-20,0.25,0.78,-0.03,1.44,1.18,1.67,42.02,46.12,Normal
4,5,53,Female,Sub-urban,1,0,1,39,Right,0,...,2025-02-07,0.78,0.19,1.63,1.92,1.4,1.63,41.55,43.69,Normal


In [2]:
# Write the generated dataset to CSV; index=False keeps the DataFrame's row
# index out of the file.
df.to_csv(path_or_buf="original_data.csv", index=False)