In [1]:
from google.colab import files
files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"bilalrezzoug","key":"499d927a77d3546c94e264c0e4c83dcc"}'}

In [2]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [3]:
!pip install kaggle
!kaggle datasets download -d kimdaegyeom/5g-traffic-datasets -p /content --unzip


Dataset URL: https://www.kaggle.com/datasets/kimdaegyeom/5g-traffic-datasets
License(s): unknown
Downloading 5g-traffic-datasets.zip to /content
100% 3.20G/3.21G [00:52<00:00, 116MB/s] 
100% 3.21G/3.21G [00:52<00:00, 65.7MB/s]


# ****Method 1 ‚ÄîChunksize****

In [1]:
# --- Setup ---
!pip install pandas

# --- Imports ---
import pandas as pd
import time
import psutil  # for memory usage

# --- File path ---
file_path = r"/content/5G_Traffic_Datasets/Live_Streaming/Naver_NOW/Naver_NOW_4.csv"  # adjust if needed

# --- Memory before reading ---
process = psutil.Process()
mem_before = process.memory_info().rss / (1024 * 1024)  # in MB

# --- Pandas with chunksize ---
chunk_size = 100000
means = []
start = time.time()

for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    means.append(chunk['Length'].mean())  # using 'Length' column

pandas_mean = sum(means) / len(means)
pandas_time = time.time() - start

# --- Memory after reading ---
mem_after = process.memory_info().rss / (1024 * 1024)  # in MB
mem_used = mem_after - mem_before

# --- Results ---
print(f"‚úÖ Mean (Length): {pandas_mean}")
print(f"‚è±Ô∏è Time with Pandas (chunksize): {pandas_time:.2f} seconds")
print(f"üíæ Memory used: {mem_used:.2f} MB")


‚úÖ Mean (Length): 1019.863359121691
‚è±Ô∏è Time with Pandas (chunksize): 121.12 seconds
üíæ Memory used: 69.90 MB


# ***Method 2 ‚Äî Dask***

In [2]:
import dask.dataframe as dd
import time, psutil


process = psutil.Process()
mem_before = process.memory_info().rss / (1024*1024)

start = time.time()
df = dd.read_csv(file_path)
dask_mean = df['Length'].mean().compute()
dask_time = time.time() - start

mem_after = process.memory_info().rss / (1024*1024)
dask_mem = mem_after - mem_before

print(" Dask finished")
print(f"Mean: {dask_mean:.4f}")
print(f"Time: {dask_time:.2f} seconds")
print(f"Memory Used: {dask_mem:.2f} MB")


 Dask finished
Mean: 1019.8474
Time: 131.78 seconds
Memory Used: 231.02 MB


# **Method 3 ‚Äî Compressed CSV (gzip)**

In [2]:
import gzip, shutil, time, psutil, os
import pandas as pd # Import pandas

file_path = r"/content/5G_Traffic_Datasets/Live_Streaming/Naver_NOW/Naver_NOW_4.csv" # Define file_path

compressed_file = file_path + ".gz"

# Compress once (you already have this, skip next two lines if file exists)
with open(file_path, 'rb') as f_in:
    with gzip.open(compressed_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

process = psutil.Process()
mem_before = process.memory_info().rss / (1024*1024)

start = time.time()
chunk_size = 500000
means = []

for chunk in pd.read_csv(compressed_file, compression='gzip', chunksize=chunk_size):
    means.append(chunk['Length'].mean())

compressed_mean = sum(means) / len(means)
compressed_time = time.time() - start
mem_after = process.memory_info().rss / (1024*1024)
compressed_mem = mem_after - mem_before

original_size = os.path.getsize(file_path) / (1024*1024)
compressed_size = os.path.getsize(compressed_file) / (1024*1024)

print("‚úÖ Compressed CSV (gzip + chunksize) finished")
print(f"Mean: {compressed_mean:.4f}")
print(f"Time: {compressed_time:.2f} seconds")
print(f"Memory Used: {compressed_mem:.2f} MB")
print(f"Original Size: {original_size:.2f} MB ‚Üí Compressed Size: {compressed_size:.2f} MB")

‚úÖ Compressed CSV (gzip + chunksize) finished
Mean: 1019.9412
Time: 135.65 seconds
Memory Used: 338.21 MB
Original Size: 5789.57 MB ‚Üí Compressed Size: 344.33 MB
