In [3]:
import numpy as np
import pandas as pd

def generate_dataset(n, seed=42):
    np.random.seed(seed)

    RBC = np.random.normal(6.5, 1.0, n).clip(4.0, 9.0)
    WBC = np.random.normal(10, 3.0, n).clip(4.0, 18.0)
    HB  = np.random.normal(13, 2.5, n).clip(7.0, 20.0)
    Platelets = np.random.normal(300, 80, n).clip(100, 700)
    Creatinine = np.random.normal(1.0, 0.3, n).clip(0.3, 2.5)
    Glucose    = np.random.normal(100, 25, n).clip(50, 200)
    Dose = np.random.uniform(0.3, 1.3, n)

    Outcome = (
        (HB > 12) &
        (RBC > 5.5) &
        (Creatinine < 1.3) &
        (Dose < 1.0)
    ).astype(int)

    return pd.DataFrame({
        "RBC": RBC,
        "WBC": WBC,
        "HB": HB,
        "Platelets": Platelets,
        "Creatinine": Creatinine,
        "Glucose": Glucose,
        "Dose": Dose,
        "Outcome": Outcome
    })


**Notebook diagnostic:** Run the next code cell to show which Python executable this notebook kernel is using and whether numpy is importable. If numpy is missing, run the third cell to install it into this kernel.

In [34]:
import sys, subprocess
print('Notebook sys.executable:', sys.executable)
try:
    import numpy as np
    print('Imported numpy, version:', np.__version__)
except Exception as e:
    print('Import numpy failed:', repr(e))
print('\npip show numpy:')
subprocess.run([sys.executable, '-m', 'pip', 'show', 'numpy'])


Notebook sys.executable: c:\Users\ADMIN\OneDrive\Desktop\RP Model\venv\Scripts\python.exe
Imported numpy, version: 2.3.5

pip show numpy:


CompletedProcess(args=['c:\\Users\\ADMIN\\OneDrive\\Desktop\\RP Model\\venv\\Scripts\\python.exe', '-m', 'pip', 'show', 'numpy'], returncode=0)

In [35]:
# Run this cell only if numpy is missing in the kernel above.
import sys, subprocess
print('Installing numpy into:', sys.executable)
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'numpy'])


Installing numpy into: c:\Users\ADMIN\OneDrive\Desktop\RP Model\venv\Scripts\python.exe


0

In [36]:
import numpy as np
import pandas as pd

np.random.seed(42)
# Number of samples
n = 1000

# Simulate hematology parameters within realistic biological ranges
RBC = np.random.normal(loc=6.5, scale=1.0, size=n).clip(4.0, 9.0)          # 4–9 ×10^6/µL
WBC = np.random.normal(loc=10, scale=3.0, size=n).clip(4.0, 18.0)          # 4–18 ×10^3/µL
HB  = np.random.normal(loc=13, scale=2.5, size=n).clip(7.0, 20.0)          # 7–20 g/dL
Platelets = np.random.normal(loc=300, scale=80, size=n).clip(100, 700)     # 100–700 ×10^3/µL
Creatinine = np.random.normal(loc=1.0, scale=0.3, size=n).clip(0.3, 2.5)   # 0.3–2.5 mg/dL
Glucose    = np.random.normal(loc=100, scale=25, size=n).clip(50, 200)     # 50–200 mg/dL

# Dose varies between low and high doses
Dose = np.random.uniform(0.3, 1.3, n)

# More realistic outcome rule (multifactor)
Outcome = (
    (HB > 12) & 
    (RBC > 5.5) &
    (Creatinine < 1.3) &
    (Dose < 1.0)
).astype(int)

# Construct dataframe
df = pd.DataFrame({
    "RBC": RBC,
    "WBC": WBC,
    "HB": HB,
    "Platelets": Platelets,
    "Creatinine": Creatinine,
    "Glucose": Glucose,
    "Dose": Dose,
    "Outcome": Outcome
})

df.head(), df.shape
df.to_csv("smart_vet_dose_1000.csv", index=False)



In [37]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X = df.drop("Outcome", axis=1)
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

y_pred = logreg.predict(X_test_scaled)

accuracy_log = accuracy_score(y_test, y_pred)
accuracy_log


0.78

In [38]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

accuracy_rf = accuracy_score(y_test, rf_pred)
accuracy_rf


0.988

In [39]:
from xgboost import XGBClassifier

xgb = XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=4)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

accuracy_xgb = accuracy_score(y_test, xgb_pred)
accuracy_xgb


0.984

In [1]:
# Model evaluation function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def evaluate_models(df):
    X = df.drop("Outcome", axis=1)
    y = df["Outcome"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    results = {}

    # Logistic Regression
    lr = LogisticRegression()
    lr.fit(X_train_scaled, y_train)
    lr_pred = lr.predict(X_test_scaled)

    results["Logistic Regression"] = {
        "Accuracy": accuracy_score(y_test, lr_pred),
        "Precision": precision_score(y_test, lr_pred, zero_division=0),
        "Recall": recall_score(y_test, lr_pred, zero_division=0),
        "F1": f1_score(y_test, lr_pred, zero_division=0)
    }

    # Random Forest
    rf = RandomForestClassifier(n_estimators=200, random_state=42)
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)

    results["Random Forest"] = {
        "Accuracy": accuracy_score(y_test, rf_pred),
        "Precision": precision_score(y_test, rf_pred, zero_division=0),
        "Recall": recall_score(y_test, rf_pred, zero_division=0),
        "F1": f1_score(y_test, rf_pred, zero_division=0)
    }

    # XGBoost
    xgb = XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=4, use_label_encoder=False, eval_metric='logloss')
    xgb.fit(X_train, y_train)
    xgb_pred = xgb.predict(X_test)

    results["XGBoost"] = {
        "Accuracy": accuracy_score(y_test, xgb_pred),
        "Precision": precision_score(y_test, xgb_pred, zero_division=0),
        "Recall": recall_score(y_test, xgb_pred, zero_division=0),
        "F1": f1_score(y_test, xgb_pred, zero_division=0)
    }

    return results


In [4]:
# Run experiments for different dataset sizes
import pandas as pd

dataset_sizes = [400, 700, 1000, 1250]
final_results = []

for size in dataset_sizes:
    df = generate_dataset(size)
    model_results = evaluate_models(df)

    for model, metrics in model_results.items():
        final_results.append({
            "Dataset Size": size,
            "Model": model,
            **metrics
        })

results_df = pd.DataFrame(final_results)
# Pivot for nicer display
results_df_pivot = results_df.pivot_table(index=["Dataset Size", "Model"], values=["Accuracy", "Precision", "Recall", "F1"]).reset_index()

# Save results
results_df.to_csv("model_results.csv", index=False)

results_df_pivot

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Unnamed: 0,Dataset Size,Model,Accuracy,F1,Precision,Recall
0,400,Logistic Regression,0.74,0.648649,0.615385,0.685714
1,400,Random Forest,1.0,1.0,1.0,1.0
2,400,XGBoost,1.0,1.0,1.0,1.0
3,700,Logistic Regression,0.8,0.710744,0.728814,0.693548
4,700,Random Forest,1.0,1.0,1.0,1.0
5,700,XGBoost,0.988571,0.983871,0.983871,0.983871
6,1000,Logistic Regression,0.78,0.682081,0.7375,0.634409
7,1000,Random Forest,0.988,0.983607,1.0,0.967742
8,1000,XGBoost,0.984,0.978022,1.0,0.956989
9,1250,Logistic Regression,0.805112,0.67027,0.688889,0.652632


In [5]:
# Show results summary
import pandas as pd
results_df = pd.read_csv("model_results.csv")
results_df_pivot = results_df.pivot_table(index=["Dataset Size", "Model"], values=["Accuracy","Precision","Recall","F1"]).reset_index()
results_df_pivot

Unnamed: 0,Dataset Size,Model,Accuracy,F1,Precision,Recall
0,400,Logistic Regression,0.74,0.648649,0.615385,0.685714
1,400,Random Forest,1.0,1.0,1.0,1.0
2,400,XGBoost,1.0,1.0,1.0,1.0
3,700,Logistic Regression,0.8,0.710744,0.728814,0.693548
4,700,Random Forest,1.0,1.0,1.0,1.0
5,700,XGBoost,0.988571,0.983871,0.983871,0.983871
6,1000,Logistic Regression,0.78,0.682081,0.7375,0.634409
7,1000,Random Forest,0.988,0.983607,1.0,0.967742
8,1000,XGBoost,0.984,0.978022,1.0,0.956989
9,1250,Logistic Regression,0.805112,0.67027,0.688889,0.652632


In [None]:
# Quick numeric summary of results
import pandas as pd
results_df = pd.read_csv('model_results.csv')
print('Rows:', len(results_df))
print(results_df.groupby('Model')['Accuracy'].agg(['mean','min','max']).round(3))
results_df.head()