In [None]:
import numpy as np
import pandas as pd

def generate_dataset(n, seed=42):
    """Simulate ``n`` rows of blood-panel features, a dose and a binary outcome.

    Each feature is drawn from a normal distribution and clipped to a fixed
    range; ``Dose`` is drawn uniformly from [0.3, 1.3). ``Outcome`` is 1 only
    when HB > 12, RBC > 5.5, Creatinine < 1.3 and Dose < 1.0 all hold,
    otherwise 0.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    seed : int, default 42
        Seed for the private random stream; the same ``(n, seed)`` pair always
        yields the same frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``RBC``, ``WBC``, ``HB``, ``Platelets``, ``Creatinine``,
        ``Glucose``, ``Dose`` (floats) and ``Outcome`` (0/1 integers).
    """
    # A private legacy RandomState produces exactly the stream that
    # np.random.seed(seed) followed by np.random.* calls would, but leaves the
    # process-wide NumPy RNG untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)

    # Draw order matters: it fixes which random numbers each column receives.
    RBC = rng.normal(6.5, 1.0, n).clip(4.0, 9.0)
    WBC = rng.normal(10, 3.0, n).clip(4.0, 18.0)
    HB = rng.normal(13, 2.5, n).clip(7.0, 20.0)
    Platelets = rng.normal(300, 80, n).clip(100, 700)
    Creatinine = rng.normal(1.0, 0.3, n).clip(0.3, 2.5)
    Glucose = rng.normal(100, 25, n).clip(50, 200)
    Dose = rng.uniform(0.3, 1.3, n)

    # Deterministic multi-factor rule: every condition must be satisfied.
    Outcome = (
        (HB > 12) &
        (RBC > 5.5) &
        (Creatinine < 1.3) &
        (Dose < 1.0)
    ).astype(int)

    return pd.DataFrame({
        "RBC": RBC,
        "WBC": WBC,
        "HB": HB,
        "Platelets": Platelets,
        "Creatinine": Creatinine,
        "Glucose": Glucose,
        "Dose": Dose,
        "Outcome": Outcome
    })


**Notebook diagnostic:** Run the next code cell to see which Python executable this notebook's kernel is using and whether numpy can be imported. If numpy is missing, run the cell after it to install numpy into this kernel.

In [None]:
import sys, subprocess
# Print the path of the interpreter running this kernel.
print('Notebook sys.executable:', sys.executable)
try:
    import numpy as np
    print('Imported numpy, version:', np.__version__)
except Exception as e:
    # Any failure while importing numpy is printed and the cell carries on.
    print('Import numpy failed:', repr(e))
print('\npip show numpy:')
# Run pip via the same interpreter (sys.executable) so the report refers to
# this kernel's environment rather than whatever `pip` is first on PATH.
subprocess.run([sys.executable, '-m', 'pip', 'show', 'numpy'])


In [None]:
# Installs numpy into this kernel's interpreter, but only when it is actually
# missing, so the cell is a harmless no-op under "Restart & Run All".
import sys, subprocess
import importlib.util

# find_spec returns None when no importable top-level package named numpy exists.
numpy_missing = importlib.util.find_spec('numpy') is None
if numpy_missing:
    print('Installing numpy into:', sys.executable)
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'numpy'])
else:
    print('numpy is already available in:', sys.executable)


In [None]:
import numpy as np
import pandas as pd

# Build the 1000-row simulated dataset, save it to CSV and preview it.
np.random.seed(42)
# Number of samples
n = 1000

# Simulate hematology parameters: normal draws clipped to the stated ranges
RBC = np.random.normal(loc=6.5, scale=1.0, size=n).clip(4.0, 9.0)          # 4–9 ×10^6/µL
WBC = np.random.normal(loc=10, scale=3.0, size=n).clip(4.0, 18.0)          # 4–18 ×10^3/µL
HB  = np.random.normal(loc=13, scale=2.5, size=n).clip(7.0, 20.0)          # 7–20 g/dL
Platelets = np.random.normal(loc=300, scale=80, size=n).clip(100, 700)     # 100–700 ×10^3/µL
Creatinine = np.random.normal(loc=1.0, scale=0.3, size=n).clip(0.3, 2.5)   # 0.3–2.5 mg/dL
Glucose    = np.random.normal(loc=100, scale=25, size=n).clip(50, 200)     # 50–200 mg/dL

# Dose varies between low and high doses
Dose = np.random.uniform(0.3, 1.3, n)

# Multi-factor outcome rule: 1 only when all four conditions hold
Outcome = (
    (HB > 12) &
    (RBC > 5.5) &
    (Creatinine < 1.3) &
    (Dose < 1.0)
).astype(int)

# Construct dataframe
df = pd.DataFrame({
    "RBC": RBC,
    "WBC": WBC,
    "HB": HB,
    "Platelets": Platelets,
    "Creatinine": Creatinine,
    "Glucose": Glucose,
    "Dose": Dose,
    "Outcome": Outcome
})

# Persist first: a notebook only renders the value of a cell's LAST
# expression, so the preview has to come after the write to be shown at all.
df.to_csv("smart_vet_dose_1000.csv", index=False)

print(df.shape)
df.head()



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Features are every column except the binary target.
X, y = df.drop(columns="Outcome"), df["Outcome"]

# Hold out a quarter of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Scaling statistics come from the training rows only and are then applied
# to both partitions, so nothing about the test rows leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# fit() returns the estimator itself, so construction and training chain.
logreg = LogisticRegression().fit(X_train_scaled, y_train)
y_pred = logreg.predict(X_test_scaled)

accuracy_log = accuracy_score(y_test, y_pred)
accuracy_log


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 200-tree forest on the unscaled training features and score it on
# the held-out rows.
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

accuracy_rf = accuracy_score(y_true=y_test, y_pred=rf_pred)
accuracy_rf


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees on the unscaled training features; accuracy is
# measured on the held-out rows.
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
).fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

accuracy_xgb = accuracy_score(y_true=y_test, y_pred=xgb_pred)
accuracy_xgb


In [None]:
# Model evaluation function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def _classification_metrics(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for one set of predictions."""
    # zero_division=0 reports 0 instead of warning when a metric's denominator
    # is zero (for example, a model that never predicts the positive class).
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1": f1_score(y_true, y_pred, zero_division=0)
    }


def evaluate_models(df):
    """Train three classifiers on ``df`` and score each on a held-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``Outcome`` column (the target); every other column
        is used as a feature.

    Returns
    -------
    dict
        Maps ``"Logistic Regression"``, ``"Random Forest"`` and ``"XGBoost"``
        to a dict with keys ``"Accuracy"``, ``"Precision"``, ``"Recall"`` and
        ``"F1"``.
    """
    X = df.drop("Outcome", axis=1)
    y = df["Outcome"]

    # 75/25 split with a fixed seed so every call sees the same partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )

    # The scaler is fitted on training rows only; only logistic regression
    # consumes the scaled features, the tree models use the raw ones.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # (result key, estimator, training features, test features)
    candidates = [
        ("Logistic Regression", LogisticRegression(), X_train_scaled, X_test_scaled),
        ("Random Forest", RandomForestClassifier(n_estimators=200, random_state=42), X_train, X_test),
        # `use_label_encoder` is deprecated in XGBoost and only triggers a
        # warning on recent releases, so it is no longer passed.
        ("XGBoost", XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=4, eval_metric='logloss'), X_train, X_test),
    ]

    results = {}
    for name, model, train_features, test_features in candidates:
        model.fit(train_features, y_train)
        results[name] = _classification_metrics(y_test, model.predict(test_features))

    return results


In [None]:
# Run experiments for different dataset sizes
import pandas as pd

# Evaluate every model on freshly generated datasets of increasing size and
# collect one flat record per (dataset size, model) pair.
dataset_sizes = [400, 700, 1000, 1250]
final_results = []

for size in dataset_sizes:
    # Use a loop-local name: rebinding the notebook-wide `df` here would leave
    # it pointing at the last (1250-row) dataset and silently change what
    # earlier cells compute if they are re-run.
    sized_df = generate_dataset(size)
    model_results = evaluate_models(sized_df)

    final_results.extend(
        {"Dataset Size": size, "Model": model, **metrics}
        for model, metrics in model_results.items()
    )

results_df = pd.DataFrame(final_results)
# Pivot for nicer display
results_df_pivot = results_df.pivot_table(index=["Dataset Size", "Model"], values=["Accuracy", "Precision", "Recall", "F1"]).reset_index()

# Save results
results_df.to_csv("model_results.csv", index=False)

results_df_pivot

In [None]:
# Show results summary
import pandas as pd
# Reload the saved metrics and lay them out as one row per
# (dataset size, model) pair for display.
results_df = pd.read_csv("model_results.csv")
results_df_pivot = (
    results_df
    .pivot_table(
        index=["Dataset Size", "Model"],
        values=["Accuracy", "Precision", "Recall", "F1"],
    )
    .reset_index()
)
results_df_pivot

In [None]:
# Quick numeric summary of results
import pandas as pd
# Reload the saved metrics, print the row count and each model's
# mean/min/max accuracy, then preview the first rows.
results_df = pd.read_csv('model_results.csv')
print('Rows:', results_df.shape[0])
print(
    results_df
    .groupby('Model')['Accuracy']
    .agg(['mean', 'min', 'max'])
    .round(3)
)
results_df.head()

In [None]:
# Visualizations: ensure plotting libs installed, then bar plots of metrics and confusion matrix for Random Forest on 1000-sample dataset
import sys, subprocess
# Import the plotting libraries, installing them into this kernel's
# interpreter only when they are genuinely absent.
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
except ImportError:
    # Only a missing package justifies a pip install; any other import-time
    # failure (e.g. a broken build) should surface instead of being masked.
    print('Installing matplotlib and seaborn...')
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib', 'seaborn'])
    import matplotlib.pyplot as plt
    import seaborn as sns

from sklearn.metrics import confusion_matrix

# Reload the persisted metrics so the plots reflect what was saved to disk.
results_df = pd.read_csv('model_results.csv')

# Accuracy per model, pooled over every dataset size.
plt.figure(figsize=(10, 5))
sns.barplot(x='Model', y='Accuracy', data=results_df).set(
    title='Accuracy by Model (all sizes)',
    ylim=(0, 1.02),
)
plt.show()

# One panel per metric, grouped by dataset size and coloured by model.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1']
fig, axes = plt.subplots(nrows=1, ncols=len(metrics), figsize=(18, 4))
for i, metric in enumerate(metrics):
    sns.barplot(
        x='Dataset Size', y=metric, hue='Model', data=results_df, ax=axes[i]
    ).set(title=metric, ylim=(0, 1.02))
plt.tight_layout()
plt.show()

# Confusion matrix for a Random Forest trained on a fresh 1000-row dataset,
# using the same 75/25 split settings as the evaluation runs.
df_1000 = generate_dataset(1000)
X, y = df_1000.drop(columns='Outcome'), df_1000['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Rows of `cm` are the actual classes, columns the predicted ones.
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues').set(
    xlabel='Predicted',
    ylabel='Actual',
    title='Confusion Matrix — Random Forest (n=1000)',
)
plt.show()