In [4]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Build a reproducible synthetic patient dataset.
# The legacy global NumPy generator is seeded once; the four draws below must
# stay in this order, otherwise the generated values change.
np.random.seed(42)
n_samples = 200

age = np.random.randint(low=20, high=80, size=n_samples)              # integers in [20, 80)
blood_pressure = np.random.randint(low=90, high=180, size=n_samples)  # integers in [90, 180)
cholesterol = np.random.randint(low=150, high=300, size=n_samples)    # integers in [150, 300)
smoking = np.random.randint(low=0, high=2, size=n_samples)            # 0 or 1

# Label rule: every smoker is positive; a non-smoker is positive only when
# age, blood pressure and cholesterol are all above their thresholds.
# The parentheses spell out the grouping that `&` binding tighter than `|`
# already implies.
disease = (
    (smoking == 1)
    | ((age > 50) & (blood_pressure > 130) & (cholesterol > 200))
).astype(int)

# Step 3: Assemble the feature arrays and the label into one DataFrame
# (one column per array, in the order listed here).
df = pd.DataFrame(
    dict(
        age=age,
        blood_pressure=blood_pressure,
        cholesterol=cholesterol,
        smoking=smoking,
        disease=disease,
    )
)

# Preview the first rows as a quick sanity check of the generated data.
print("First 5 rows of the dataset:", df.head(), sep="\n")

# Step 4: Separate the target column from the predictor columns
y = df['disease']
X = df.drop(columns=['disease'])

# Step 5: Hold out 20% of the rows for testing; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Create the logistic-regression classifier (iteration cap raised to
# 1000) and fit it on the training split; fit() returns the estimator itself.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Step 7: Predict on the held-out rows
y_pred = model.predict(X_test)
# Second column of predict_proba, i.e. the probability of the second entry in
# model.classes_ (label 1 here, since the labels are 0/1).
# NOTE(review): y_prob is not read again in the visible part of the notebook.
y_prob = model.predict_proba(X_test)[:, 1]

# Print each evaluation report under its heading, comparing true vs. predicted labels.
for heading, metric in (
    ("\nModel Accuracy:", accuracy_score),
    ("\nConfusion Matrix:\n", confusion_matrix),
    ("\nClassification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred))
# Score one hypothetical patient. The single-row frame uses the same column
# names, in the same order, as the features the model was trained on.
new_patient = pd.DataFrame(
    [{'age': 60, 'blood_pressure': 140, 'cholesterol': 210, 'smoking': 1}]
)

# Row 0 is the only patient; column 1 of predict_proba is the probability of
# the second class (disease = 1).
predicted_class = model.predict(new_patient)[0]
predicted_prob = model.predict_proba(new_patient)[0, 1]

print("\nNew patient prediction:")
print("Predicted class (disease=1, no disease=0):", predicted_class)
print("Probability of disease:", round(predicted_prob, 2))
# Pair every feature name with its fitted weight. model.coef_[0] is the single
# row of weights for this two-class problem; indexing it by the feature names
# and resetting the index yields a two-column table (Feature, Coefficient).
coefficients = (
    pd.Series(model.coef_[0], index=X.columns, name='Coefficient')
    .rename_axis('Feature')
    .reset_index()
)
print("\nLogistic Regression Coefficients:", coefficients, sep="\n")


First 5 rows of the dataset:
   age  blood_pressure  cholesterol  smoking  disease
0   58             160          276        1        1
1   71             148          279        1        1
2   48             175          166        0        0
3   34             117          253        0        0
4   62             155          286        1        1

Model Accuracy: 0.925

Confusion Matrix:
 [[14  0]
 [ 3 23]]

Classification Report:
               precision    recall  f1-score   support

           0       0.82      1.00      0.90        14
           1       1.00      0.88      0.94        26

    accuracy                           0.93        40
   macro avg       0.91      0.94      0.92        40
weighted avg       0.94      0.93      0.93        40


New patient prediction:
Predicted class (disease=1, no disease=0): 1
Probability of disease: 0.98

Logistic Regression Coefficients:
          Feature  Coefficient
0             age     0.049113
1  blood_pressure     0.027292
2     