In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
import random

def generate_data_sample():
    """Build one synthetic patient record.

    Returns:
        list: ``[age, height, weight, systolic_bp, diastolic_bp, diabetes,
        smoker, heart_disease, diagnosis]``. The first five entries are
        integers drawn uniformly from fixed inclusive ranges, the next three
        are 0/1 flags, and ``diagnosis`` is ``"risk"`` only when all three
        flags are 1 (otherwise ``"healthy"``), i.e. the label is a
        deterministic function of the three flags.
    """
    # Inclusive (low, high) bounds, listed in output order:
    # age, height, weight, systolic_bp, diastolic_bp.
    numeric_bounds = [(20, 80), (150, 200), (50, 100), (100, 180), (60, 100)]
    measurements = [random.randint(low, high) for low, high in numeric_bounds]

    # Binary indicators, in output order: diabetes, smoker, heart_disease.
    flags = [random.randint(0, 1) for _ in range(3)]

    # The product of three 0/1 flags is 1 exactly when every flag is set.
    diagnosis = "risk" if all(flags) else "healthy"

    return measurements + flags + [diagnosis]

# Seed the generator so that a fresh "Restart & Run All" rebuilds exactly the
# same synthetic dataset (and therefore the same split, model and metrics).
random.seed(42)

# Draw 10,000 synthetic records and load them into a DataFrame whose column
# order matches the list returned by generate_data_sample().
data_samples = [generate_data_sample() for _ in range(10000)]
data = pd.DataFrame(
    data_samples,
    columns=[
        "age",
        "height",
        "weight",
        "systolic_bp",
        "diastolic_bp",
        "diabetes",
        "smoker",
        "heart_disease",
        "diagnosis",
    ],
)

In [3]:
# Print the column index, then show the first five rows as the cell's output
# (head() defaults to five rows).
print(data.columns)
data.head()

Index(['age', 'height', 'weight', 'systolic_bp', 'diastolic_bp', 'diabetes',
       'smoker', 'heart_disease', 'diagnosis'],
      dtype='object')


Unnamed: 0,age,height,weight,systolic_bp,diastolic_bp,diabetes,smoker,heart_disease,diagnosis
0,44,191,53,167,83,0,1,0,healthy
1,78,189,65,124,70,1,0,0,healthy
2,21,194,50,120,70,0,1,0,healthy
3,62,187,61,110,80,1,1,1,risk
4,66,189,79,120,76,0,1,0,healthy


In [4]:
# One-hot encode the three 0/1 flag columns; each becomes a <name>_0 / <name>_1
# pair of indicator columns appended after the remaining columns.
# Note: `data` is rebound here, so this cell cannot be re-run on its own
# without first re-running the cell that builds `data`.
flag_columns = ['diabetes', 'smoker', 'heart_disease']
data = pd.get_dummies(data, columns=flag_columns)
data

Unnamed: 0,age,height,weight,systolic_bp,diastolic_bp,diagnosis,diabetes_0,diabetes_1,smoker_0,smoker_1,heart_disease_0,heart_disease_1
0,44,191,53,167,83,healthy,True,False,False,True,True,False
1,78,189,65,124,70,healthy,False,True,True,False,True,False
2,21,194,50,120,70,healthy,True,False,False,True,True,False
3,62,187,61,110,80,risk,False,True,False,True,False,True
4,66,189,79,120,76,healthy,True,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,42,197,50,136,80,healthy,False,True,False,True,True,False
9996,70,188,64,131,90,healthy,True,False,True,False,True,False
9997,24,156,75,139,65,healthy,True,False,False,True,True,False
9998,67,155,81,165,77,risk,False,True,False,True,False,True


In [5]:
# Separate the label column (y) from the feature columns (X).
target_column = 'diagnosis'
y = data[target_column]
X = data.drop(columns=[target_column])

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable for a given dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Fit a 100-tree random forest on the scaled training features; the fixed
# random_state makes the fitted model repeatable for a given training set.
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X=X_train_scaled, y=y_train)

In [9]:
# Predict a diagnosis label for every row of the scaled held-out test set.
y_pred = clf.predict(X=X_test_scaled)

In [10]:
# Fraction of held-out rows whose predicted label equals the true label,
# reported to two decimal places.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


In [11]:
# Example patient, in the column order of the training features:
# age, height, weight, systolic_bp, diastolic_bp, diabetes_0, diabetes_1,
# smoker_0, smoker_1, heart_disease_0, heart_disease_1
# NOTE(review): smoker_0 and smoker_1 are both 0 in this row, which is not a
# valid one-hot pair -- confirm the intended smoker status.
new_patient_data = [[45, 165, 70, 120, 80, 1, 0, 0, 0, 1, 0]]

# Label the row with the training columns before scaling so the scaler
# receives the same feature names it was fitted with; passing a bare list
# makes scikit-learn warn that X does not have valid feature names.
new_patient_data_scaled = scaler.transform(
    pd.DataFrame(new_patient_data, columns=X.columns)
)



In [12]:
# Classify the single scaled example patient and report its predicted label.
prediction = clf.predict(X=new_patient_data_scaled)
print(f"Predicted Disease Risk: {prediction[0]}")

Predicted Disease Risk: healthy


In [13]:
# One-hot pair for each yes/no answer, in (<flag>_0, <flag>_1) order.
YES_NO_ONE_HOT = {"no": [1, 0], "yes": [0, 1]}


def parse_patient_rows(text):
    """Convert whitespace-separated patient records into numeric feature rows.

    Each non-blank line of ``text`` is one record. A field equal to "yes" or
    "no" is expanded into its two-element one-hot pair; every other field is
    converted with ``int``.

    Args:
        text: Records separated by newlines, fields separated by tabs/spaces.

    Returns:
        list[list[int]]: One list of integers per non-blank input line.

    Raises:
        ValueError: If a field is neither "yes"/"no" nor an integer literal.
    """
    rows = []
    for line in text.splitlines():
        fields = line.split()
        if not fields:
            continue  # ignore blank lines instead of emitting an empty row
        row = []
        for field in fields:
            if field in YES_NO_ONE_HOT:
                row.extend(YES_NO_ONE_HOT[field])
            else:
                row.append(int(field))
        rows.append(row)
    return rows


# Raw records: age, height, weight, systolic_bp, diastolic_bp, followed by
# yes/no answers for diabetes, smoker and heart_disease.
new_data_text = """55	175	85	140	90	no	yes	yes
30	160	55	110	70	no	no	no
68	180	75	130	85	yes	no	yes
50	170	80	135	85	no	yes	no
40	155	60	125	80	no	no	no
58	160	65	140	95	yes	yes	yes
35	175	72	128	82	no	no	no
47	168	78	132	88	yes	no	no"""

# Parse field by field rather than rewriting the whole text with str.replace
# and patching list items in place while iterating over them.
new_data = parse_patient_rows(new_data_text)
print(new_data)

[[55, 175, 85, 140, 90, 1, 0, 0, 1, 0, 1], [30, 160, 55, 110, 70, 1, 0, 1, 0, 1, 0], [68, 180, 75, 130, 85, 0, 1, 1, 0, 0, 1], [50, 170, 80, 135, 85, 1, 0, 0, 1, 1, 0], [40, 155, 60, 125, 80, 1, 0, 1, 0, 1, 0], [58, 160, 65, 140, 95, 0, 1, 0, 1, 0, 1], [35, 175, 72, 128, 82, 1, 0, 1, 0, 1, 0], [47, 168, 78, 132, 88, 0, 1, 1, 0, 1, 0]]


In [14]:
# Scale the batch with the scaler fitted on the training data. Labelling the
# rows with the training columns gives the scaler the same feature names it
# was fitted with; a bare list makes scikit-learn warn that X does not have
# valid feature names.
new_data_scaled = scaler.transform(pd.DataFrame(new_data, columns=X.columns))



In [15]:
# Classify every row of the scaled batch (this rebinds `prediction`).
prediction = clf.predict(X=new_data_scaled)

In [16]:
# Show the predicted labels for the batch as this cell's output.
prediction

array(['healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'risk',
       'healthy', 'healthy'], dtype=object)

In [17]:
# Free-standing string literal: it is neither assigned nor used, so the cell
# merely echoes it back as output.
# NOTE(review): this lists 9 labels, while the batch parsed earlier in the
# notebook has 8 rows -- presumably pasted reference labels; confirm the source.
'''
healthy
risk
healthy
risk
risk
healthy
risk
healthy
risk
'''

'\nhealthy\nrisk\nhealthy\nrisk\nrisk\nhealthy\nrisk\nhealthy\nrisk\n'