In [1]:
import pandas as pd
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(url)
df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Check for nulls or zeros in key columns
df.info()
df.describe()

# Optionally replace 0s with NaN in specific columns
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols] = df[cols].replace(0, pd.NA)
df.dropna(inplace=True)

# Separate features and target
X = df.drop('Outcome', axis=1)
y = df['Outcome']


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7721518987341772
              precision    recall  f1-score   support

           0       0.80      0.87      0.83        52
           1       0.70      0.59      0.64        27

    accuracy                           0.77        79
   macro avg       0.75      0.73      0.74        79
weighted avg       0.77      0.77      0.77        79



In [5]:
import joblib
joblib.dump(model, "diabetes_prediction_model.pkl")


['diabetes_prediction_model.pkl']

In [6]:
import joblib
import numpy as np

# Load the trained model
model = joblib.load("diabetes_prediction_model.pkl")


In [7]:
# Example patient data (from test set or custom)
# Format: [Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age]

sample_input = np.array([[2, 120, 70, 25, 80, 33.6, 0.627, 50]])

# Predict
prediction = model.predict(sample_input)
result = "Diabetic" if prediction[0] == 1 else "Not Diabetic"
print("Prediction:", result)


Prediction: Not Diabetic




In [8]:
import pandas as pd

columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
           'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

sample_dict = {
    'Pregnancies': [2],
    'Glucose': [120],
    'BloodPressure': [70],
    'SkinThickness': [25],
    'Insulin': [80],
    'BMI': [33.6],
    'DiabetesPedigreeFunction': [0.627],
    'Age': [50]
}

# Create DataFrame
sample_df = pd.DataFrame(sample_dict)

# Predict
prediction = model.predict(sample_df)
result = "Diabetic" if prediction[0] == 1 else "Not Diabetic"
print("Prediction:", result)


Prediction: Not Diabetic


In [21]:
from sklearn.ensemble import RandomForestClassifier


In [14]:
import pandas as pd

heart_df = pd.read_csv("heart.csv")  # use uploaded file
heart_df.head()


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [15]:
heart_df.info()
heart_df.describe()
heart_df.isnull().sum()  # check for null values


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


Unnamed: 0,0
Age,0
Sex,0
ChestPainType,0
RestingBP,0
Cholesterol,0
FastingBS,0
RestingECG,0
MaxHR,0
ExerciseAngina,0
Oldpeak,0


In [16]:
from sklearn.preprocessing import LabelEncoder

df = heart_df.copy()  # work on a copy

# List of categorical columns
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Apply label encoding
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])


In [17]:
X_heart = df.drop('HeartDisease', axis=1)
y_heart = df['HeartDisease']


In [18]:
import joblib
joblib.dump(model_heart, "heart_disease_prediction_model.pkl")


NameError: name 'model_heart' is not defined

In [22]:
model = RandomForestClassifier()
model.fit(X_train, y_train)


In [23]:
model_heart = RandomForestClassifier()
model_heart.fit(X_train, y_train)


In [24]:
import joblib
joblib.dump(model_heart, "heart_disease_prediction_model.pkl")


['heart_disease_prediction_model.pkl']

In [25]:
# Test the model with the first record in the test set (or any custom input)
sample = X_test.iloc[0:1]  # Take the first row of the test set
prediction = model_heart.predict(sample)

# Output the result
result = "Has Heart Disease" if prediction[0] == 1 else "No Heart Disease"
print(f"Prediction for this sample: {result}")


Prediction for this sample: No Heart Disease


In [26]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load and convert to DataFrame
cancer = load_breast_cancer()
X_cancer = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y_cancer = pd.Series(cancer.target)


In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, test_size=0.2, random_state=42)


In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model_cancer = LogisticRegression(max_iter=10000)
model_cancer.fit(X_train, y_train)

y_pred = model_cancer.predict(X_test)
print("Breast Cancer Prediction Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Breast Cancer Prediction Accuracy: 0.956140350877193
              precision    recall  f1-score   support

           0       0.97      0.91      0.94        43
           1       0.95      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [29]:
import joblib
joblib.dump(model_cancer, "breast_cancer_prediction_model.pkl")


['breast_cancer_prediction_model.pkl']

In [30]:
sample = X_test.iloc[0:1]
prediction = model_cancer.predict(sample)
result = "Benign" if prediction[0] == 1 else "Malignant"

print("Prediction:", result)


Prediction: Benign
