In [10]:
# ✅ STEP 0: Install required libraries
!pip install modin[ray] dask pyyaml --quiet


ERROR:root:Unexpected exception finding object shape
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/google/colab/_debugpy_repr.py", line 61, in get_shape
    return str(shape)
           ^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/dask_expr/_collection.py", line 4803, in __repr__
    return f"<dask_expr.expr.Scalar: expr={self.expr}, dtype={self.dtype}>"
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/dask_expr/_expr.py", line 2627, in __str__
    return f"{self.left} {self._operator_repr} {self.right}"
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/dask_expr/_reductions.py", line 826, in __str__
    base = str(self.frame)
           ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/dask_expr/_core.py", line 83, in __str__
    s = ", ".join(self._operands_for_repr())
                  ^

import pandas as pd
i

In [11]:
import pandas as pd
import modin.pandas as mpd
import dask.dataframe as dd
import yaml
import time
import os

In [12]:
# ✅ STEP 2: Build a tiny NFL predictions CSV in-line and persist it to disk
file_name = "nfl_predictions.csv"

# Header row followed by five games; the text ends with a trailing newline.
csv_data = (
    "game_id,home_team,away_team,home_score,away_score,predicted_winner\n"
    "1,Chiefs,Bengals,27,24,Chiefs\n"
    "2,Bills,Patriots,21,17,Bills\n"
    "3,49ers,Eagles,28,30,Eagles\n"
    "4,Cowboys,Giants,17,13,Cowboys\n"
    "5,Rams,Packers,20,23,Packers\n"
)

# Write the sample rows out so every reader below loads the same file.
with open(file_name, "w") as csv_file:
    csv_file.write(csv_data)


In [13]:
# ✅ STEP 3: Read the file with each library and record how long the read takes


def _timed_read(label, reader, path, materialize=None):
    """Load ``path`` with ``reader``, print the elapsed time, return the result.

    Parameters
    ----------
    label : str
        Library name shown inside the brackets of the printed timing line.
    reader : callable
        Called as ``reader(path)``; whatever it returns is handed back
        to the caller unchanged.
    path : str
        Path of the file to load.
    materialize : callable, optional
        If given, called with the loaded frame before the timer stops.
        Use it for lazy readers so the measurement includes the real work.

    Returns
    -------
    object
        The object returned by ``reader(path)``.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() follows the wall clock and can
    # jump if the system time is adjusted.
    start = time.perf_counter()
    frame = reader(path)
    if materialize is not None:
        materialize(frame)
    elapsed = time.perf_counter() - start
    print(f"[{label}] Time taken:", elapsed, "seconds")
    return frame


df_pandas = _timed_read("Pandas", pd.read_csv, file_name)
df_modin = _timed_read("Modin", mpd.read_csv, file_name)
# dd.read_csv only builds a lazy task graph, so without a compute() the timing
# would exclude the actual file parse. The computed result is discarded:
# df_dask itself stays a lazy Dask DataFrame for the cells that follow.
df_dask = _timed_read(
    "Dask",
    dd.read_csv,
    file_name,
    materialize=lambda frame: frame.compute(),
)


[Pandas] Time taken: 0.003728151321411133 seconds
[Modin] Time taken: 0.04166984558105469 seconds
[Dask] Time taken: 0.006651878356933594 seconds


In [14]:
# ✅ STEP 4: Clean column names (remove special characters, spaces)
def clean_columns(df):
    r"""Return a copy of ``df`` whose column names contain only word characters.

    Each column name has surrounding whitespace stripped, and then every
    character that does not match ``\w`` is removed. The input frame is not
    modified: the renaming is applied to a copy, so the caller's original
    frame keeps its headers.

    Parameters
    ----------
    df : DataFrame
        Frame exposing ``.copy()`` and a string-valued ``.columns`` index.
        This notebook passes pandas, Modin and Dask frames.

    Returns
    -------
    DataFrame
        A new frame of the same kind as ``df`` with the cleaned column names.
    """
    cleaned = df.copy()
    cleaned.columns = df.columns.str.strip().str.replace(r"[^\w]", "", regex=True)
    return cleaned

# Normalise the headers of all three frames in one pass.
df_pandas, df_modin, df_dask = (
    clean_columns(frame) for frame in (df_pandas, df_modin, df_dask)
)


In [15]:
# ✅ STEP 5: Persist the expected layout (delimiter + column order) as YAML
schema = dict(
    separator='|',
    columns=df_pandas.columns.tolist(),
)

# Serialise the mapping straight into schema.yaml.
with open("schema.yaml", "w") as schema_file:
    yaml.dump(schema, stream=schema_file)


In [16]:
# Reload the schema from disk and confirm the cleaned frame still matches it.
with open("schema.yaml") as schema_file:
    yaml_data = yaml.safe_load(schema_file)

expected_columns = yaml_data['columns']
actual_columns = df_pandas.columns

# Compare the count first so the element-wise comparison below is well-defined.
assert len(actual_columns) == len(expected_columns), "Column count mismatch"
assert all(actual_columns == expected_columns), "Column names mismatch"
print("✅ Schema validation passed")

✅ Schema validation passed


In [18]:
# ✅ STEP 7: Write cleaned file as delimiter-separated gzipped text
output_file = "output_cleaned.txt.gz"

# Take the delimiter from the schema loaded and validated above (currently
# '|') instead of hard-coding it a second time, so the written file and
# schema.yaml cannot drift apart.
df_pandas.to_csv(
    output_file,
    sep=yaml_data['separator'],
    index=False,
    compression='gzip',
)
print("✅ File written as:", output_file)


✅ File written as: output_cleaned.txt.gz


In [19]:
# ✅ STEP 8: Report the frame's dimensions and the on-disk size of the output
row_count, column_count = df_pandas.shape
size_in_mb = os.path.getsize(output_file) / (1024 ** 2)  # bytes -> megabytes

print("Total rows:", row_count)
print("Total columns:", column_count)
print("File size (MB):", size_in_mb)


Total rows: 5
Total columns: 6
File size (MB): 0.000179290771484375


### 📝 Final Report Summary

- **File Ingested:** `nfl_predictions.csv` (simulated NFL predictions data)  
- **Columns Cleaned:** Yes (special characters & spaces removed)  
- **Libraries Used:**  
  - Pandas  
  - Modin (Ray)  
  - Dask  
- **Schema Validation:** ✅ Passed using generated `schema.yaml`  
- **Output File:** `output_cleaned.txt.gz`  
  - Format: `|`-separated  
  - Compressed: `.gz`  
- **Summary Stats:**  
  - Total Rows: 5  
  - Total Columns: 6  
  - File Size: Very small (< 0.001 MB)  
- **Performance Observations:**  
  - All three libraries read this small file in well under 0.1 seconds  
  - Pandas had the lowest overhead  
  - Modin and Dask are more useful for large datasets


In [20]:
# List the working directory to confirm the generated artifacts exist.
# os.listdir() returns entries in arbitrary order, so sort them to get a
# stable listing across runs. `os` comes from the notebook's import cell.
for entry in sorted(os.listdir()):
    print(entry)

.config
output_cleaned.txt.gz
nfl_predictions.csv
schema.yaml
sample_data


In [21]:
# Colab-only: trigger a browser download for each generated artifact.
from google.colab import files

for artifact in ('schema.yaml', 'output_cleaned.txt.gz', "nfl_predictions.csv"):
    files.download(artifact)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>