In [60]:
import json
from datetime import datetime
from scipy.stats import pearsonr

# JSON text is UTF-8 by specification; naming the encoding keeps the load
# from depending on the platform's default codec.
with open('DataEngineeringQ2.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

def calculate_age(birth_date, today=None):
    """Return the age in completed years for a ``...Z`` birth timestamp.

    Parameters
    ----------
    birth_date : str or None
        Timestamp formatted as ``%Y-%m-%dT%H:%M:%S.%fZ``; the fractional
        seconds may be omitted (``%Y-%m-%dT%H:%M:%SZ``).
    today : datetime, optional
        Reference date the age is measured against. Defaults to
        ``datetime.now()``; pass a fixed value for reproducible results.

    Returns
    -------
    int or None
        Completed years between ``birth_date`` and ``today``, or ``None``
        when ``birth_date`` is falsy (``None`` or an empty string).

    Raises
    ------
    ValueError
        If ``birth_date`` matches neither accepted format.
    """
    if not birth_date:
        return None

    try:
        born = datetime.strptime(birth_date, "%Y-%m-%dT%H:%M:%S.%fZ")
    except ValueError:
        # Accept timestamps without fractional seconds; anything else still
        # raises ValueError from this second parse.
        born = datetime.strptime(birth_date, "%Y-%m-%dT%H:%M:%SZ")

    if today is None:
        today = datetime.now()

    # One year is taken off while this year's birthday is still ahead.
    birthday_reached = (today.month, today.day) >= (born.month, born.day)
    return today.year - born.year - (0 if birthday_reached else 1)

In [61]:
def is_valid_mobile(phone_number):
    """Return True when ``phone_number`` is a valid Indian mobile number.

    A number is valid when, after removing an optional ``+91`` or ``91``
    country prefix, exactly ten decimal digits remain and their value lies
    between 6000000000 and 9999999999 inclusive.

    Parameters
    ----------
    phone_number : str or None
        Raw phone number. Any non-string value is treated as invalid.

    Returns
    -------
    bool
    """
    if not isinstance(phone_number, str):
        return False

    if phone_number.startswith("+91"):
        phone_number = phone_number[3:]
    elif len(phone_number) == 12 and phone_number.startswith("91"):
        # Strip a bare "91" only when ten digits follow it; otherwise a
        # ten-digit number that itself starts with 91 would be truncated.
        phone_number = phone_number[2:]

    # isdecimal() (unlike isdigit()) only admits characters int() can parse.
    if len(phone_number) != 10 or not phone_number.isdecimal():
        return False
    return 6000000000 <= int(phone_number) <= 9999999999


# Tally the records whose top-level "phoneNumber" passes is_valid_mobile;
# a record without that key is checked as the empty string.
valid_mobile_count = sum(
    1 for entry in data if is_valid_mobile(entry.get("phoneNumber", ""))
)

valid_mobile_count

18

In [62]:
# Build two aligned samples: for every record with a birth date, the
# patient's age and the number of medicines prescribed in the consultation.
ages, num_medicines = [], []

for entry in data:
    patient_age = calculate_age(entry.get("patientDetails", {}).get("birthDate"))
    if patient_age is None:
        # No birth date, so the record cannot contribute an observation.
        continue
    ages.append(patient_age)
    num_medicines.append(len(entry.get("consultationData", {}).get("medicines", [])))

# Only attempt the correlation when each sample holds at least two points.
has_enough_samples = len(ages) > 1 and len(num_medicines) > 1
if not has_enough_samples:
    print("Insufficient data for correlation calculation.")
else:
    correlation, _ = pearsonr(ages, num_medicines)
    print(round(correlation, 2))


-0.21


In [63]:
# Percentage of records whose patientDetails hold no usable value for each
# tracked field; any falsy value (absent key, None, empty string) is missing.
total_records = len(data)
missing_counts = dict.fromkeys(("firstName", "lastName", "birthDate"), 0)

for entry in data:
    details = entry.get("patientDetails", {})
    for field in missing_counts:
        if not details.get(field):
            missing_counts[field] += 1

missing_percentages = {
    field: (missing / total_records) * 100
    for field, missing in missing_counts.items()
}
missing_percentages


{'firstName': 0.0,
 'lastName': 70.96774193548387,
 'birthDate': 32.25806451612903}

In [64]:
from collections import Counter

# Mode-impute missing genders, then report the share of female patients.
#
# The mode is taken over the non-missing values only, so the missing-value
# placeholder can never be chosen as the fill value. A gender counts as
# missing when it is falsy (absent key, None, or empty string), which means
# empty values are imputed as well as absent keys.
genders = [record.get("patientDetails", {}).get("gender") for record in data]

gender_counts = Counter(gender for gender in genders if gender)
# With no known gender at all there is nothing to impute from.
mode_gender = gender_counts.most_common(1)[0][0] if gender_counts else None

imputed_female_count = sum(
    1 for gender in genders if (gender or mode_gender) == "F"
)
# len(genders) is the number of records; an empty dataset yields 0.0
# instead of a ZeroDivisionError.
female_percentage = (imputed_female_count / len(genders)) * 100 if genders else 0.0

round(female_percentage, 2)


32.26

In [65]:
# Bucket every patient with a known age into one of four groups.
# Each bracket is (label, inclusive upper age); ages above the last
# bracket fall through to "Senior".
age_brackets = (("Child", 12), ("Teen", 19), ("Adult", 59))
age_groups = {"Child": 0, "Teen": 0, "Adult": 0, "Senior": 0}

for entry in data:
    patient_age = calculate_age(entry.get("patientDetails", {}).get("birthDate"))
    if patient_age is None:
        continue
    group = next(
        (label for label, upper in age_brackets if patient_age <= upper),
        "Senior",
    )
    age_groups[group] += 1

age_groups["Adult"]


21

In [66]:
# Mean number of medicines per record; a record without consultationData
# or without a medicines list contributes zero to the total.
total_medicines = sum(
    len(entry.get("consultationData", {}).get("medicines", []))
    for entry in data
)

average_medicines = total_medicines / total_records
round(average_medicines, 2)


2.13

In [67]:
from collections import Counter

# Count how often each medicine name is prescribed across all consultations.
medicine_counter = Counter(
    med["medicineName"]
    for record in data
    for med in record.get("consultationData", {}).get("medicines", [])
)

# most_common(3) returns fewer than three entries when fewer than three
# distinct names exist. Taking the last entry in that case would report the
# first or second medicine as "third", so report None instead.
top_three = medicine_counter.most_common(3)
third_most_frequent_medicine = top_three[2][0] if len(top_three) == 3 else None
third_most_frequent_medicine


'C'

In [68]:
# Flatten the truthiness of every prescribed medicine's "isActive" flag,
# then express active vs. inactive as percentages of all medicines.
activity_flags = [
    bool(med["isActive"])
    for entry in data
    for med in entry.get("consultationData", {}).get("medicines", [])
]

active_count = sum(activity_flags)
inactive_count = len(activity_flags) - active_count
total_medicines = active_count + inactive_count

if total_medicines:
    active_percentage = (active_count / total_medicines) * 100
    inactive_percentage = (inactive_count / total_medicines) * 100
else:
    # No medicines at all: report both shares as zero.
    active_percentage = 0
    inactive_percentage = 0

(round(active_percentage, 2), round(inactive_percentage, 2))


(69.7, 30.3)

In [69]:
# Gather every headline figure computed above into a single summary dict.
results = {
    "missing_percentages": {k: round(v, 2) for k, v in missing_percentages.items()},
    "female_percentage": round(female_percentage, 2),
    "adult_count": age_groups["Adult"],
    "average_medicines": round(average_medicines, 2),
    "third_most_frequent_medicine": third_most_frequent_medicine,
    "medicine_distribution": (round(active_percentage, 2), round(inactive_percentage, 2)),
    "valid_mobile_count": valid_mobile_count,
    # pearsonr yields a NumPy scalar; float() keeps the summary to built-in
    # types so it displays as a plain number. `correlation` only exists when
    # both samples held at least two points, hence the guard.
    "pearson_correlation": float(round(correlation, 2)) if len(ages) > 1 and len(num_medicines) > 1 else "Insufficient data"
}

results


{'missing_percentages': {'firstName': 0.0,
  'lastName': 70.97,
  'birthDate': 32.26},
 'female_percentage': 32.26,
 'adult_count': 21,
 'average_medicines': 2.13,
 'third_most_frequent_medicine': 'C',
 'medicine_distribution': (69.7, 30.3),
 'valid_mobile_count': 18,
 'pearson_correlation': np.float64(-0.21)}