In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import pytesseract
from PIL import Image
import matplotlib.pyplot as plt

import os

# Location of the Tesseract binary used by pytesseract. The Windows install
# path stays the default; set the TESSERACT_CMD environment variable to point
# at a different binary without editing the notebook.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)

In [3]:
# Load the health dataset and preview its first five rows.
data = pd.read_csv("realistic_health_dataset.csv")
data.head()

Unnamed: 0,Name,Age,Gender,HbA1c (%),Blood Pressure (mmHg),BMI,Cholesterol (mg/dL)
0,Destiny Castillo,48,Male,8.7,130/90,19.9,225
1,Matthew Smith,48,Male,8.3,104/67,27.4,200
2,April Johnson,70,Male,7.5,126/74,30.5,164
3,Nathaniel Ellison,51,Female,9.3,104/70,34.3,171
4,Heather Shaffer,45,Female,6.3,103/75,33.1,235


In [None]:
# Data cleaning: make sure the dataset contains no null values and no duplicated rows.

In [4]:
# Summarise column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Name                   100 non-null    object 
 1   Age                    100 non-null    int64  
 2   Gender                 100 non-null    object 
 3   HbA1c (%)              100 non-null    float64
 4   Blood Pressure (mmHg)  100 non-null    object 
 5   BMI                    100 non-null    float64
 6   Cholesterol (mg/dL)    100 non-null    int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 5.6+ KB


In [5]:
data.isnull().sum()  # count null values per column; any that are present must be removed

Name                     0
Age                      0
Gender                   0
HbA1c (%)                0
Blood Pressure (mmHg)    0
BMI                      0
Cholesterol (mg/dL)      0
dtype: int64

In [6]:
# Boolean mask marking rows that repeat an earlier row.
duplicates = data.duplicated()

In [7]:
# Show the duplicate mask.
# NOTE(review): the rendered Series is truncated, so `duplicates.sum()` would confirm the count directly.
duplicates

0     False
1     False
2     False
3     False
4     False
      ...  
95    False
96    False
97    False
98    False
99    False
Length: 100, dtype: bool

In [8]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,Age,HbA1c (%),BMI,Cholesterol (mg/dL)
count,100.0,100.0,100.0,100.0
mean,49.96,7.294,26.53,201.74
std,17.416421,1.578378,4.665985,27.158577
min,18.0,4.5,18.2,150.0
25%,36.0,6.075,22.5,179.0
50%,49.5,7.2,27.1,206.5
75%,64.0,8.625,30.025,223.25
max,80.0,10.0,34.5,250.0


In [9]:
# Blood pressure is stored as text, so it is described separately from the numeric columns.
data['Blood Pressure (mmHg)'].describe()

count        100
unique        99
top       100/63
freq           2
Name: Blood Pressure (mmHg), dtype: object

In [42]:
# Disabled plotting snippet kept as a bare string, so nothing is drawn.
# NOTE(review): it passes the literals 'age' and 'HbA1c(%)' to plt.bar, whereas the
# frame's columns are 'Age' and 'HbA1c (%)' — revisit before re-enabling.
'''plt.bar('age', 'HbA1c(%)', color='orange' )
plt.title("represenation of % of HbA1c with resepct to age")
plt.xlabel('age')
plt.ylabel('HbA1c(%)')'''

'plt.bar(\'age\', \'HbA1c(%)\', color=\'orange\' )\nplt.title("represenation of % of HbA1c with resepct to age")\nplt.xlabel(\'age\')\nplt.ylabel(\'HbA1c(%)\')'

In [13]:
from sklearn.preprocessing import LabelEncoder

# Replace the Gender strings with integer codes; the fitted encoder stays in
# `le_gender` so the codes can be mapped back to their labels.
le_gender = LabelEncoder()
le_gender.fit(data['Gender'])
data['Gender'] = le_gender.transform(data['Gender'])

In [14]:
# Inspect the frame after Gender has been encoded.
data

Unnamed: 0,Name,Age,Gender,HbA1c (%),Blood Pressure (mmHg),BMI,Cholesterol (mg/dL)
0,Destiny Castillo,48,1,8.7,130/90,19.9,225
1,Matthew Smith,48,1,8.3,104/67,27.4,200
2,April Johnson,70,1,7.5,126/74,30.5,164
3,Nathaniel Ellison,51,0,9.3,104/70,34.3,171
4,Heather Shaffer,45,0,6.3,103/75,33.1,235
...,...,...,...,...,...,...,...
95,Dawn Beard,76,1,7.0,110/78,30.0,192
96,Paul Simmons,52,1,9.1,128/70,33.3,176
97,Mitchell Moss,64,0,8.4,107/81,27.4,207
98,Daniel Gill,42,1,9.4,135/83,22.9,224


In [33]:
import pytesseract
from PIL import Image
import pandas as pd
import numpy as np

# Function to extract text from image
def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path to an image file that PIL can open.

    Returns:
        The text produced by ``pytesseract.image_to_string``.
    """
    # The context manager releases the file handle as soon as OCR finishes.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

# Function to preprocess lab data
def preprocess_lab_data(text):
    """Pull lab values and the medication list out of OCR'd report text.

    Matching is case-insensitive (the text is lower-cased first). Each value
    is located by its label followed by a colon, and only the numeric part
    after the label is read, so units, line breaks or further text following
    the number do not break parsing.

    Args:
        text: Raw text of the lab report and/or prescription.

    Returns:
        tuple: ``(lab_data, prescription_data)``. ``lab_data`` is a dict with
        the keys ``'HbA1c'`` (float), ``'BloodPressure'`` (``'sys/dia'``
        string), ``'Cholesterol'`` (int) and ``'BMI'`` (float); a value is
        ``None`` when its label or number is not found. ``prescription_data``
        is a list of lower-cased, stripped medication strings, empty when
        there is no ``medications:`` label.
    """
    import re  # local import keeps this function self-contained

    text = text.lower()
    number = r'(\d+(?:\.\d+)?)'

    def find(pattern, convert):
        # The first occurrence of a label is the one that is used.
        match = re.search(pattern, text)
        return convert(match.group(1)) if match else None

    lab_data = {
        'HbA1c': find(r'hba1c\s*:\s*' + number, float),
        # Drop whitespace around the slash so the value is always 'sys/dia'.
        'BloodPressure': find(
            r'blood pressure\s*:\s*(\d+\s*/\s*\d+)',
            lambda reading: re.sub(r'\s+', '', reading),
        ),
        'Cholesterol': find(r'cholesterol\s*:\s*(\d+)', int),
        'BMI': find(r'bmi\s*:\s*' + number, float),
    }

    # Everything after the label belongs to the medication list.
    meds = re.search(r'medications\s*:(.*)', text, re.DOTALL)
    if meds:
        # Entries are ';'-separated; strip padding and skip blank fragments.
        prescription_data = [
            entry.strip() for entry in meds.group(1).split(';') if entry.strip()
        ]
    else:
        prescription_data = []

    return lab_data, prescription_data

# Function to preprocess wearable data
def preprocess_wearable_data(wearable_data):
    """Collapse raw wearable readings into one summary value per metric.

    Args:
        wearable_data: Mapping with the keys ``'HeartRate'``, ``'SleepHours'``
            and ``'Steps'``, each holding a sequence of numeric readings.

    Returns:
        dict: A new dict (the input is left untouched) in which
        ``'HeartRate'`` is the mean of its readings and ``'SleepHours'`` and
        ``'Steps'`` are the sums of theirs. Any other keys are copied as-is.
    """
    # Work on a shallow copy so the caller's raw readings are preserved and
    # calling the function again on the same input gives the same answer.
    summary = dict(wearable_data)
    summary['HeartRate'] = np.mean(wearable_data['HeartRate'])
    # NOTE(review): these totals are later compared against 6 hours / 5000
    # steps — confirm whether those limits are per day or per whole period.
    summary['SleepHours'] = np.sum(wearable_data['SleepHours'])
    summary['Steps'] = np.sum(wearable_data['Steps'])

    return summary

# Function to analyze patient data
def analyze_patient_data(lab_data, prescription_data, wearable_data):
    """Flag likely conditions from lab values, medications and wearable data.

    Args:
        lab_data: Dict of lab values; the keys ``'HbA1c'`` (number) and
            ``'BloodPressure'`` (``'systolic/diastolic'`` string) are read.
            Missing or ``None`` values are skipped.
        prescription_data: Iterable of medication strings, matched
            case-insensitively by substring. ``None`` is treated as empty.
        wearable_data: Dict with summary values under ``'HeartRate'``,
            ``'SleepHours'`` and ``'Steps'``. Missing or ``None`` values are
            skipped.

    Returns:
        tuple: ``(disease, recommendations, observations)`` — three lists of
        strings. An observation is added only for evidence that was actually
        found, so a condition flagged by the lab value alone does not also
        report a prescription (and vice versa).

    Raises:
        ValueError: If the systolic part of ``'BloodPressure'`` is not an
            integer.
    """
    disease = []
    recommendations = []
    observations = []

    lab_data = lab_data or {}
    wearable_data = wearable_data or {}
    medications = [str(med).lower() for med in (prescription_data or [])]

    def takes(drug):
        # True when any listed medication mentions the given drug name.
        return any(drug in med for med in medications)

    # Check for Diabetes
    hba1c = lab_data.get('HbA1c')
    high_hba1c = hba1c is not None and hba1c > 6.5
    on_diabetes_med = takes('metformin')
    if high_hba1c or on_diabetes_med:
        disease.append('Diabetes')
        recommendations.append("Maintain a healthy diet, exercise regularly, and monitor blood sugar levels.")
        if high_hba1c:
            observations.append("Lab Report Analysis: High HbA1c indicates diabetes.")
        if on_diabetes_med:
            observations.append("Prescription Analysis: Prescribed diabetes medication.")

    # Check for Hypertension
    blood_pressure = lab_data.get('BloodPressure')
    # Only the systolic (first) number is compared against the threshold.
    high_systolic = bool(blood_pressure) and int(blood_pressure.split('/')[0]) > 130
    on_hypertension_med = takes('lisinopril')
    if high_systolic or on_hypertension_med:
        disease.append('Hypertension')
        recommendations.append("Reduce salt intake, exercise regularly, and monitor blood pressure levels.")
        if high_systolic:
            observations.append("Lab Report Analysis: High blood pressure indicates hypertension.")
        if on_hypertension_med:
            observations.append("Prescription Analysis: Prescribed hypertension medication.")

    # Analyze wearable data
    heart_rate = wearable_data.get('HeartRate')
    if heart_rate is not None and heart_rate > 100:
        observations.append("Wearable Data Analysis: Elevated heart rate detected.")

    sleep_hours = wearable_data.get('SleepHours')
    if sleep_hours is not None and sleep_hours < 6:
        observations.append("Wearable Data Analysis: Insufficient sleep detected.")

    steps = wearable_data.get('Steps')
    if steps is not None and steps < 5000:
        observations.append("Wearable Data Analysis: Low physical activity detected.")

    return disease, recommendations, observations

# Example input data: two image files (read relative to the working directory)
# and a dict of raw wearable readings, one list per metric.
lab_report_path = 'WhatsApp Image 2025-01-21 at 19.11.48_48acb824.jpg'
prescription_path = 'prescription.jpg'
wearable_data = {
    'HeartRate': [75, 80, 85, 90],
    'SleepHours': [7, 6, 5, 6],
    'Steps': [3000, 5000, 4000, 6000]
}

# Process the images and wearable data: OCR each image to text first.
lab_report_text = extract_text_from_image(lab_report_path)
prescription_text = extract_text_from_image(prescription_path)

# Both OCR texts are joined and parsed in a single pass.
# NOTE(review): no separator is inserted between them, so the end of the lab
# report runs directly into the start of the prescription text.
labx = lab_report_text + prescription_text
lab_data, prescription_data = preprocess_lab_data(labx)
# `wearable_data` is rebound here from the raw readings to their summary values.
wearable_data = preprocess_wearable_data(wearable_data)

# Analyze the patient data
disease, recommendations, observations = analyze_patient_data(lab_data, prescription_data, wearable_data)

# Print the results, one observation per line
print("Diseases:", disease)
print("Recommendations:", recommendations)
print("Observations:")
for observation in observations:
    print("-", observation)


Diseases: ['Hypertension']
Recommendations: ['Reduce salt intake, exercise regularly, and monitor blood pressure levels.']
Observations:
- Lab Report Analysis: High blood pressure indicates hypertension.
- Prescription Analysis: Prescribed hypertension medication.
