1. Import required libraries and components.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

from data.generation import generate_synthetic_dataset

2. Establish variables and parameters that are used throughout the exercise.

In [2]:
file_path = 'data.csv'
num_features = 10_000            # columns
num_informative_features = 500  # columns w/relevant pattern
num_samples = 19_000          # rows

test_size = 0.2               # percent data to reserve for test
random_state = 42

3. Generate a CSV file that contains randomized data that can fit into system memory.

In [3]:
generate_synthetic_dataset(num_features, num_samples, file_path, num_informative_features)

[K10%

KeyboardInterrupt: 

4. Train a model by using the typical, non-streaming technique.

In [4]:
# Load data
data = pd.read_csv(file_path)
X = data.drop('target', axis=1)
y = data['target']

# Generate train and test splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0
)

# Train a small neural network, which includes 2 layers of 10 neurons
model = SGDClassifier()
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)

print("Classification report:")
print(classification_report(y_test, y_predicted))

Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1420
           1       1.00      1.00      1.00       580

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



In [13]:
# Train a larger neural network, which includes 4 layers of 1000 neurons
model = MultinomialNB()
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)

print("Classification report:")
print(classification_report(y_test, y_predicted))

ValueError: Negative values in data passed to MultinomialNB (input X)

In [7]:
def read_data_in_batches(X: np.array, y: np.array, batch_size: int):
    num_batches = get_num_batches(X, batch_size)

    for i in range(num_batches):
        batch_start = i * batch_size
        batch_end = batch_start + batch_size
        yield X[batch_start:batch_end], y[batch_start:batch_end]

def get_num_batches(array: np.array, batch_size: int):
    num_samples = array.shape[0]
    # rounded division (//) returns int
    return num_samples // batch_size

count = 1
batch_size = 10
num_batches = get_num_batches(X_train, batch_size)
model = SGDClassifier() # MLPClassifier(hidden_layer_sizes=(1000, 1000, 1000, 1000), random_state=1)

# stream data and train incrementally
for X_batch, y_batch in read_data_in_batches(X_train, y_train, batch_size):

    model.partial_fit(X_batch, y_batch, classes=y.unique())

    # print percentage complete
    print(f'\r\033[K{100 * count // num_batches}%', end='')

    count += 1

y_predicted = model.predict(X_test)
print()
print("Classification report:")
print(classification_report(y_test, y_predicted))

[K100%
Classification report:
              precision    recall  f1-score   support

           0       0.85      0.77      0.81       148
           1       0.48      0.62      0.54        52

    accuracy                           0.73       200
   macro avg       0.67      0.69      0.68       200
weighted avg       0.76      0.73      0.74       200



5. Update the variables and generate a larger CSV file.

In [None]:
file_path = 'big_data.csv'
num_samples = 400_000       # rows
chunk_size = 100_000           # rows to process at once
num_chunks = num_samples // chunk_size

6. Generate a large CSV file.
Note that this process can take a few minutes to complete.

In [None]:
# create CSV file and write header
header = [f'feature_{i+1}' for i in range(num_features)] + ['target']
with open(file_path, 'w') as f:
    f.write(','.join(header) + '\n')

for i in range(num_chunks):
    # generate  data with distinct clusters for each class
    X, y = make_classification(n_samples=chunk_size, n_features=num_features,
                               n_informative=num_informative_features, n_redundant=0,
                               n_classes=2, weights=[0.7, 0.3], random_state=random_state)

    # create and append data frame to file
    df = pd.DataFrame(X, columns=[f'feature_{i+1}' for i in range(num_features)])
    df['target'] = y
    df.to_csv(file_path, mode='a', header=False, index=False)

    # print percentage complete
    print(f'\r\033[K{100 * (i+1) // num_chunks}%', end='')

print(f"\nCSV file '{file_path}' generated")

7. Attempt to train the larger model by using the typical, non-streaming technique.
Doing so fails because the kernel crashes when trying to load such a large file into memory all at once.

In [None]:
data = pd.read_csv(file_path)
X = data.drop('target', axis=1)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0
)
model = MLPClassifier(hidden_layer_sizes=(1000,1000,1000), random_state=1)
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)

print("Classification report:")
print(classification_report(y_test, y_predicted))

8. Because the Python kernel crashed in the previous step, re-run the `import` instructions and recreate the variables.

In [None]:
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report

num_features = 5              # columns
num_informative_features = 3  # columns w/relevant pattern
test_size = 0.2               # percent data to reserve for test
random_state = 42

file_path = 'big_data.csv'
num_samples = 10_000_000      # rows
chunk_size = 100_000          # rows to process at once
num_chunks = num_samples // chunk_size

9. Train the model by streaming the data from the file in chunks.
By doing so, only some of the data needs to fit into memory at the same time.
Note that this process can take a few minutes to complete.

> NOTE You must use classifiers that work well with piecemeal, or "out-of-core", training.
The `GaussianNB` classifier, or Gaussian naive-Bayes, is one such classifier.

In [None]:
model = GaussianNB()
scaler = StandardScaler()

# initial state
first_chunk = True
count = 1
X_train_partial = pd.DataFrame()
y_train_partial = pd.Series(dtype="float64")

# stream data and train incrementally
for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    X = chunk.drop('target', axis=1)
    y = chunk['target']

    X_scaled = scaler.fit_transform(X) if first_chunk else scaler.transform(X)

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_size, random_state=42)

    # incrementally train the model
    X_train_partial = pd.concat([X_train_partial, pd.DataFrame(X_train)], ignore_index=True)
    y_train_partial = pd.concat([y_train_partial, y_train], ignore_index=True)

    model.partial_fit(X_train_partial, y_train_partial, classes=y.unique())

    # print percentage complete
    print(f'\r\033[K{100 * count // num_chunks}%', end='')

    # update state
    first_chunk = False
    count += 1

y_predicted = model.predict(X_test)
print()
print("Classification report:")
print(classification_report(y_test, y_predicted))