In [1]:
# Mount Google Drive to access large files
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [2]:
!pip install modin[dask]
!pip install modin[ray]


Collecting modin[dask]
  Downloading modin-0.31.0-py3-none-any.whl.metadata (17 kB)
Collecting pandas<2.3,>=2.2 (from modin[dask])
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading modin-0.31.0-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas, modin
  Attempting uninstall: pandas
    Found existing installation: pandas 2.1.4
    Uninstalling pandas-2.1.4:
      Successfully uninstalled pandas-2.1.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflic

In [3]:
# Path to the large CSV file
file_path = '/content/drive/MyDrive/2019-Oct.csv'


In [4]:
import pandas as pd
import time

start_time = time.time()
df_pandas = pd.read_csv(file_path)
pandas_time = time.time() - start_time

print(f"Pandas time: {pandas_time:.2f} seconds")


Pandas time: 144.85 seconds


In [5]:
import dask.dataframe as dd

start_time = time.time()
df_dask = dd.read_csv(file_path)
dask_time = time.time() - start_time

print(f"Dask time: {dask_time:.2f} seconds")



Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Dask time: 0.73 seconds


In [7]:
import ray
import ray.data

ray.init(ignore_reinit_error=True)

start_time = time.time()
df_ray = ray.data.read_csv(file_path)
ray_time = time.time() - start_time

print(f"Ray time: {ray_time:.2f} seconds")


2024-08-11 13:19:27,396	INFO worker.py:1614 -- Calling ray.init() again after it has already been called.


Ray time: 3.18 seconds


In [8]:


# Remove special characters and white spaces from column names
df_pandas.columns = df_pandas.columns.str.replace('[^a-zA-Z0-9]', '_', regex=True).str.strip()


In [9]:
import yaml

# Create YAML content
yaml_content = {'columns': list(df_pandas.columns)}

# Write YAML to file
with open('/content/columns.yaml', 'w') as yaml_file:
    yaml.dump(yaml_content, yaml_file)


In [10]:
# Read YAML file
with open('/content/columns.yaml', 'r') as yaml_file:
    yaml_data = yaml.safe_load(yaml_file)

# Validate columns
yaml_columns = yaml_data['columns']
file_columns = list(df_pandas.columns)

assert yaml_columns == file_columns, "Column names do not match YAML"
print("Column names validated successfully.")


Column names validated successfully.


In [11]:
# Write to a pipe-separated file in gz format
pipe_file_path = '/content/large_file_pipe.gz'
df_pandas.to_csv(pipe_file_path, sep='|', compression='gzip', index=False)


In [12]:
# Summary
total_rows = df_pandas.shape[0]
total_columns = df_pandas.shape[1]
file_size = df_pandas.memory_usage(deep=True).sum()

print(f"Total number of rows: {total_rows}")
print(f"Total number of columns: {total_columns}")
print(f"File size (in bytes): {file_size}")


Total number of rows: 42448764
Total number of columns: 9
File size (in bytes): 16508378723
