1. Import required libraries and components.

In [None]:
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Utility function to generate a synthetic data set
from data.generation import generate_synthetic_dataset

2. Establish variables and parameters that are used throughout this notebook.

In [None]:
# Path handed to the synthetic data generator for the CSV file
file_path = 'dataset.csv'
# Size of the synthetic data set: rows (samples) and columns (features)
num_rows = 10_000
num_columns = 6
# Number of columns passed to the generator as informative features
num_relevant_columns = 4

test_size = 0.2  # Fraction of the data to reserve for test (0.2 = 20%)
random_state = 42  # Random seed for reproducibility

3. Generate a CSV file that contains synthetic, randomized data for a classification problem.
The generated data set fits in memory.

In [None]:
# Create the synthetic binary-classification CSV from the notebook settings:
# output location first, then data set shape, then class/feature structure.
generate_synthetic_dataset(
    file_path=file_path,
    num_samples=num_rows,
    num_features=num_columns,
    num_informative_features=num_relevant_columns,
    num_classes=2,
)

4. Split the data into train and test subsets of features and target variables.

In [None]:
# Read the CSV from the configured `file_path` so that the file loaded here is
# always the file that was generated, even if the path setting is changed.
data = pd.read_csv(file_path)
X = data.drop(columns='target')
y = data['target']

# Generate train and test splits. The split is driven by the notebook-level
# `test_size` and `random_state` settings rather than duplicated literals, so
# changing the configuration cell actually affects (and reproduces) the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state
)

5. Train a model by using the typical, non-streaming technique.
The code trains the model by loading the full data set in memory.

In [None]:
# Fit a Gaussian naive Bayes classifier on the in-memory training split
# (fit returns the fitted estimator), then predict the held-out test split.
model = GaussianNB().fit(X_train, y_train)
y_predicted = model.predict(X_test)

# Emit the heading and the report on separate lines in a single print call.
print(
    "Classification report:",
    classification_report(y_test, y_predicted),
    sep="\n",
)

6. Define a function that adds more complex features (columns) to the data set.
Generating features by transforming and combining the existing ones is a common technique to improve a model's accuracy.

> NOTE: The following function only implements a memory-intensive loop to illustrate the objective of the exercise.

In [None]:
def preprocess(df, num_variations=10_000):
    """Widen ``df`` by appending scaled copies of all of its columns.

    For every ``i`` in ``range(num_variations)`` a frame equal to ``df * i``
    is built, and all frames are concatenated column-wise after the original.
    This is deliberately memory hungry: every scaled frame is held in a list
    until the final concatenation.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame. It is not modified.
    num_variations : int, default 10_000
        Number of scaled frames to append. The default reproduces the
        original, intentionally oversized, workload.

    Returns
    -------
    pandas.DataFrame
        A frame with the same rows as ``df`` and
        ``(1 + num_variations) * df.shape[1]`` columns. Column labels of
        ``df`` are repeated for every appended frame.
    """
    variations = [df]
    # `df * i` already allocates a new frame, so an explicit copy is redundant.
    variations.extend(df * i for i in range(num_variations))

    return pd.concat(variations, axis=1)

7. Attempt to generate features and train the model with the extended data set by using the typical, non-streaming technique.
Doing so fails: the kernel crashes because the extended data does not fit in memory all at once.

In [None]:
# Build the extended feature set for the whole training split in one go.
# Per the step description, this cell is expected to crash the kernel because
# the extended data does not fit in memory.
X_preprocessed = preprocess(X_train)

# Train a fresh classifier on the extended features. y_train is reused as-is
# because preprocess concatenates along columns and leaves the rows unchanged.
model = GaussianNB()
model.fit(X_preprocessed, y_train)

8. Open the `incremental-training.ipynb` notebook and follow the instructions to scale the training.