In [3]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import time

In [6]:
df = pd.read_csv("week_11_data_pima.csv")

In [8]:
# List of dataset sizes to test
sizes = [100, 1000, 10000, 100000, 1000000, 10000000]

# Prepare results list
results = []

for size in sizes:
    print(f"\nTesting dataset size: {size}")

    # Duplicate data if needed
    df_sample = pd.concat([df] * (size // len(df) + 1), ignore_index=True).iloc[:size]

    # Split features and target
    X = df_sample.drop("outcome", axis=1)
    y = df_sample["outcome"]

    # Split train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize model
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', verbosity=0)

    # Time model fitting and cross-validation
    start = time.time()
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    elapsed = time.time() - start

    # Fit and predict on test set
    model.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, model.predict(X_test))

    # Store results
    results.append({
        "Dataset Size": size,
        "CV Accuracy": round(scores.mean(), 4),
        "Test Accuracy": round(test_accuracy, 4),
        "Time (s)": round(elapsed, 2)
    })

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
print("\nSummary:")
print(results_df)


Testing dataset size: 100

Testing dataset size: 1000

Testing dataset size: 10000

Testing dataset size: 100000

Testing dataset size: 1000000

Testing dataset size: 10000000

Summary:
   Dataset Size  CV Accuracy  Test Accuracy  Time (s)
0           100       0.8500         0.9500      0.96
1          1000       0.9400         0.9700      0.65
2         10000       0.9720         0.9820      2.03
3        100000       0.9859         0.9866      4.06
4       1000000       0.9915         0.9923     34.95
5      10000000       0.9931         0.9932    339.80
