In [11]:
import numpy as np
import pandas as pd

In [12]:
np.random.seed(42)
num_rows = 1000000

data = {
    'age': np.random.randint(18,70,size=num_rows),
    'income': np.random.normal(5000,15000,size=num_rows),
    'spending_score': np.random.uniform(0,100,size=num_rows),
    'gender': np.random.choice(['Male','Female'],size=num_rows),
    'category': np.random.choice(['A','B','C','D'],size=num_rows)
}

In [13]:
df = pd.DataFrame(data)

In [14]:
df.shape

(1000000, 5)

In [15]:
df.head()

Unnamed: 0,age,income,spending_score,gender,category
0,56,16619.384821,93.995376,Male,D
1,69,23025.890999,76.079867,Female,C
2,46,21325.489213,90.748439,Female,B
3,32,29188.039885,39.354932,Male,C
4,60,12783.907234,49.036111,Female,C


In [None]:
#!pip3 install vaex

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Defaulting to user installation because normal site-packages is not writeable
Collecting vaex
  Downloading vaex-4.17.0-py3-none-any.whl (4.8 kB)
Collecting vaex-ml<0.19,>=0.18.3
  Downloading vaex_ml-0.18.3-py3-none-any.whl (58 kB)
     |████████████████████████████████| 58 kB 2.4 MB/s             
[?25hCollecting vaex-astro<0.10,>=0.9.3
  Downloading vaex_astro-0.9.3-py3-none-any.whl (20 kB)
Collecting vaex-viz<0.6,>=0.5.4
  Downloading vaex_viz-0.5.4-py3-none-any.whl (19 kB)
Collecting vaex-core~=4.17.1
  Downloading vaex_core-4.17.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
     |████████████████████████████████| 4.6 MB 427 kB/s             ██████████████              | 2.6 MB 11.

In [None]:
# step-2: create a dataset
import numpy as np
import vaex

df = vaex.from_arrays(**data)
df.export_hdf5('large_dataset.hdf5')

In [19]:
# step-3 : load the dataset vaex

df = vaex.open('large_dataset.hdf5')

In [20]:
len(df)

1000000

In [21]:
df.get_column_names

<bound method DataFrame.get_column_names of #        age    income               spending_score      gender    category
0        56     16619.38482116408    93.9953760409572    Male      D
1        69     23025.890999498293   76.07986709281052   Female    C
2        46     21325.489212513217   90.74843876846965   Female    B
3        32     29188.0398854275     39.354931784919955  Male      C
4        60     12783.907233692862   49.036111226090505  Female    C
...      ...    ...                  ...                 ...       ...
999,995  69     2208.0867381955036   18.643440766596242  Male      D
999,996  41     -12730.151109629594  85.85668503901374   Male      D
999,997  42     -3660.941022900981   16.52091156113841   Female    B
999,998  59     -5522.546418434153   10.467759498539609  Female    C
999,999  45     -22563.10994357534   60.77988517567287   Female    A>

In [22]:
df.head()

#,age,income,spending_score,gender,category
0,56,16619.4,93.9954,Male,D
1,69,23025.9,76.0799,Female,C
2,46,21325.5,90.7484,Female,B
3,32,29188.0,39.3549,Male,C
4,60,12783.9,49.0361,Female,C
5,25,8198.12,7.85741,Female,B
6,38,-249.385,91.2016,Male,D
7,56,16661.2,31.3905,Female,B
8,36,-9742.44,42.2035,Male,B
9,40,17143.6,42.9079,Male,A


In [23]:
## step-4 Basic EDA in vaex

df.describe()

Unnamed: 0,age,income,spending_score,gender,category
data_type,int64,float64,float64,string,string
count,1000000,1000000,1000000,1000000,1000000
,0,0,0,0,0
mean,43.5034,4977.399537317712,50.00153708439174,--,--
std,15.0024,14987.4,28.8715,--,--
min,18,-65944.2,9.95806e-05,--,--
max,69,73683.7,99.9998,--,--


In [24]:
# Filter data

# Filter rows where age > 30
filtered_df = df[df['age'] > 30]

# Check the number of rows after filtering
print(f"Number of rows after filtering: {len(filtered_df)}")


Number of rows after filtering: 750567


In [26]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# Define the model
model = SGDClassifier(random_state=42)

# Use Vaex to process data in chunks
chunk_size = 100_000
for i in range(0, len(df), chunk_size):
    chunk = df[i:i+chunk_size].to_pandas_df()
    
    X = chunk.drop(['gender', 'category'], axis=1)
    y = chunk['gender'].apply(lambda x: 1 if x == 'Male' else 0)
    
    # Train the model incrementally
    model.partial_fit(X, y, classes=[0, 1])

print("Model trained!")

# === Test the model ===
# Use the remaining 20% as the test set
test_data = df[int(len(df) * 0.8):].to_pandas_df()

X_test = test_data.drop(['gender', 'category'], axis=1)
y_test = test_data['gender'].apply(lambda x: 1 if x == 'Male' else 0)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Model trained!
Accuracy: 0.50
