<a href="https://colab.research.google.com/github/VasireddyNandini/predictive-analysis/blob/predict_patient_develop_disease/predict_disease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# -------------------------
# STEP 1: Load dataset
# -------------------------
df = pd.read_csv('patients.csv')  # 🔁 Replace with your file path

print("✅ Loaded data shape:", df.shape)
print("📊 Columns:", df.columns.tolist())

# -------------------------
# STEP 2: Preprocessing
# -------------------------
# Drop rows with missing values (or use imputation)
df = df.dropna()

# Separate features and target
X = df.drop('disease', axis=1)  # clinical parameters
y = df['disease']               # target: 0 or 1

# -------------------------
# STEP 3: Train/test split
# -------------------------
# Split BEFORE scaling: the scaler must learn its mean/std from the training
# rows only, otherwise statistics of the held-out rows leak into preprocessing
# and the evaluation below is no longer a clean hold-out estimate.
# The row assignment depends only on the number of samples and random_state,
# so the same rows land in train/test as when splitting a pre-scaled array.
# NOTE(review): on very small or imbalanced data the test split can end up
# with a single class; consider `stratify=y` once every class has enough rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features (optional but recommended).
# Fitting on a DataFrame also records the column names on the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # fit on training rows only
X_test = scaler.transform(X_test)        # reuse the training statistics

# Full feature matrix scaled with the training statistics; kept so that any
# later code referring to `X_scaled` keeps working.
X_scaled = scaler.transform(X)

# -------------------------
# STEP 4: Train model
# -------------------------
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# -------------------------
# STEP 5: Evaluate
# -------------------------
y_pred = model.predict(X_test)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("📄 Report:\n", classification_report(y_test, y_pred))

# -------------------------
# STEP 6: Predict on new patient data
# -------------------------
def predict_disease(new_patient_dict):
    """Predict whether a single new patient is likely to develop the disease.

    Uses the module-level ``scaler`` and ``model`` fitted earlier in this file.

    Args:
        new_patient_dict: Mapping of feature name to value for one patient.
            It should contain every feature column the scaler was fitted on;
            key order does not matter and extra keys are ignored.

    Returns:
        ``"Likely to Develop Disease"`` when the model predicts class ``1``,
        otherwise ``"Unlikely to Develop Disease"``.

    Raises:
        ValueError: If a feature the scaler was fitted on is missing from
            ``new_patient_dict``.
    """
    input_df = pd.DataFrame([new_patient_dict])

    # When the scaler was fitted on a DataFrame it remembers the training
    # column names. Align the input to that exact set and order so a dict
    # written in a different key order is not scaled with the wrong column's
    # statistics, and a forgotten feature produces a clear error.
    expected = getattr(scaler, "feature_names_in_", None)
    if expected is not None:
        missing = [name for name in expected if name not in input_df.columns]
        if missing:
            raise ValueError(f"Missing feature(s) for prediction: {missing}")
        input_df = input_df[list(expected)]

    input_scaled = scaler.transform(input_df)
    prediction = model.predict(input_scaled)[0]
    return "Likely to Develop Disease" if prediction == 1 else "Unlikely to Develop Disease"

# Example usage: describe one patient by feature name and classify them.
new_patient = dict(
    age=55,
    blood_pressure=140,
    cholesterol=220,
    glucose=110,
    heart_rate=80,
    # ➕ include all features from the CSV except 'disease'
)

# Run the fitted scaler + model on the single record and report the verdict.
result = predict_disease(new_patient)
print("\n🧠 Prediction for New Patient:", result)


✅ Loaded data shape: (8, 6)
📊 Columns: ['age', 'blood_pressure', 'cholesterol', 'glucose', 'heart_rate', 'disease']
✅ Accuracy: 1.0
📄 Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2


🧠 Prediction for New Patient: Likely to Develop Disease
