In [2]:
import numpy as np
import pandas as pd
import joblib

In [3]:
# Load the previously fitted estimator from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load 'random_forest.pkl' if it comes from a trusted source.
model = joblib.load('random_forest.pkl')

# Feature columns, in the order they are handed to the model.
# NOTE(review): presumably this matches the training column order -- confirm.
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Six hand-written sample rows. Two cells are deliberately left missing
# (SkinThickness in row 2, Age in row 4) to check how the model copes with NaN.
# NOTE(review): the zeros flagged "0 = missing" are passed to the model
# unchanged -- confirm that this matches how the model was trained.
test_data = pd.DataFrame(
    {
        'Pregnancies': [0, 8, 2, 1, 10, 3],
        'Glucose': [99, 181, 85, 155, 200, 140],
        'BloodPressure': [72, 90, 60, 88, 70, 80],
        'SkinThickness': [20, 0, np.nan, 40, 35, 25],  # 0 = missing
        'Insulin': [0, 550, 0, 0, 0, 80],  # 0 = missing
        'BMI': [28.0, 33.6, 22.1, 35.2, 38.9, 29.0],
        'DiabetesPedigreeFunction': [0.299, 0.672, 0.248, 0.527, 2.420, 0.375],
        'Age': [21, 55, 30, 42, np.nan, 45],  # NaN test
    }
)

# Pick the feature columns in the listed order; this yields a new frame.
new_data = test_data.loc[:, columns]

# One predicted label per row of new_data.
predict = model.predict(new_data)

In [4]:
# Display the array returned by model.predict(new_data): one label per input row.
predict

array([0, 1, 0, 1, 1, 1])

In [5]:
# Display the feature frame that was passed to the model (missing cells show as NaN).
new_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,99,72,20.0,0,28.0,0.299,21.0
1,8,181,90,0.0,550,33.6,0.672,55.0
2,2,85,60,,0,22.1,0.248,30.0
3,1,155,88,40.0,0,35.2,0.527,42.0
4,10,200,70,35.0,0,38.9,2.42,
5,3,140,80,25.0,80,29.0,0.375,45.0


## Hyperparameter tuning

In [6]:
# # %% 
# # Step 1: Import Libraries
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
# from sklearn.preprocessing import StandardScaler
# from sklearn.impute import KNNImputer
# from imblearn.over_sampling import SMOTE
# from imblearn.pipeline import Pipeline as ImbPipeline
# from xgboost import XGBClassifier
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve
# import matplotlib.pyplot as plt
# import joblib
# import seaborn as sns

# # %% 
# # Step 2: Load and Prepare Data
# df = pd.read_csv('/home/suhas/Desktop/office/Data Science/Notes/Data Analysis/Data/diabetes.csv')

# # Replace 0s with NaN in biological features
# zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# df[zero_cols] = df[zero_cols].replace(0, np.nan)

# # Feature Engineering
# df['Glucose_BMI'] = df['Glucose'] * df['BMI']  # Interaction term
# df['Age_Bin'] = pd.cut(df['Age'], bins=[20, 30, 40, 50, 100], labels=False)  # Binning

# X = df.drop('Outcome', axis=1)
# y = df['Outcome']

# # %%
# # Step 3: Create Advanced Pipeline
# full_pipeline = ImbPipeline([
#     ('imputer', KNNImputer(n_neighbors=5)),  # Impute missing values
#     ('scaler', StandardScaler()),            # Scale features
#     ('smote', SMOTE(random_state=42)),       # Handle class imbalance
#     ('model', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))  # XGBoost model
# ])

# # %%
# # Step 4: Hyperparameter Tuning
# param_grid = {
#     'model__n_estimators': [100, 200],
#     'model__max_depth': [3, 5, 7],
#     'model__learning_rate': [0.01, 0.1],
#     'model__scale_pos_weight': [2, 3]        # Penalize misclassifying diabetic cases
# }

# grid_search = GridSearchCV(
#     full_pipeline,
#     param_grid,
#     cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
#     scoring='recall',                        # Focus on minimizing false negatives
#     n_jobs=-1
# )

# grid_search.fit(X, y)

# # %%
# # Step 5: Best Model Evaluation
# best_model = grid_search.best_estimator_
# print(f"Best Parameters: {grid_search.best_params_}")

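# # NOTE(review): grid_search.fit(X, y) above already used every row, so this hold-out
# # split was seen during hyperparameter selection and the scores below are optimistic.
# # Split before tuning (and consider stratify=y) for an unbiased estimate.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)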
# best_model.fit(X_train, y_train)
# y_pred = best_model.predict(X_test)
# y_probs = best_model.predict_proba(X_test)[:, 1]

# # %%
# # Step 6: Metrics & Visualization
# print("\n🔥 Final Performance 🔥")
# print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
# print(classification_report(y_test, y_pred))

# # Precision-Recall Curve
# precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
# plt.figure(figsize=(8, 6))
# plt.plot(recall, precision, marker='.')
# plt.xlabel('Recall')
# plt.ylabel('Precision')
# plt.title('Precision-Recall Curve')
# plt.show()

# # Confusion Matrix
# sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d')
# plt.title('Confusion Matrix')
# plt.show()

# # %%
# # Step 7: Save Model
# joblib.dump(best_model, 'optimized_diabetes_model.pkl')

# # %%
# # Step 8: Test Predictions (Using Your Sample Data)
# test_data = pd.DataFrame({
#     'Pregnancies': [0, 8, 2, 1, 10],
#     'Glucose': [99, 181, 85, 155, 200],
#     'BloodPressure': [72, 90, 60, 88, 70],
#     'SkinThickness': [20, 0, 0, 40, 35],
#     'Insulin': [0, 550, 0, 0, 0],
#     'BMI': [28.0, 33.6, 22.1, 35.2, 38.9],
#     'DiabetesPedigreeFunction': [0.299, 0.672, 0.248, 0.527, 2.420],
#     'Age': [21, 55, 30, 42, None],
#     'Glucose_BMI': [99*28, 181*33.6, 85*22.1, 155*35.2, 200*38.9],  # Manually calculated
#     'Age_Bin': [0, 3, 0, 2, None]  # pd.cut bins for 21,55,30,42,None: 30 falls in (20, 30] -> 0; missing Age -> NaN
# })

# # Handle nulls using the pipeline
# predictions = best_model.predict(test_data)
# probabilities = best_model.predict_proba(test_data)[:, 1]

# test_data['Predicted_Outcome'] = predictions
# test_data['Probability'] = probabilities
# print("\n🧪 Test Predictions:")
# print(test_data[['Glucose', 'BMI', 'Age', 'Predicted_Outcome', 'Probability']])