In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/telecom/data2.csv
/kaggle/input/telecom/data1.csv


In [2]:
import pandas as pd
import time
import os
import psutil

file_path = "/kaggle/input/telecom/data1.csv"

# حجم الملف
file_size_bytes = os.path.getsize(file_path)
file_size_gb = file_size_bytes / (1024 ** 3)
print(f"File size on disk: {file_size_gb:.2f} GB\n")

process = psutil.Process(os.getpid())
start_time = time.time()
max_memory = 0

chunksize = 100000
total_rows = 0

for chunk in pd.read_csv(file_path, chunksize=chunksize):
    total_rows += len(chunk)
    current = process.memory_info().rss / (1024 * 1024)  # MB
    if current > max_memory:
        max_memory = current

elapsed = time.time() - start_time

print("Pandas + chunksize completed successfully")
print(f"Total rows read: {total_rows:,}")
print(f"Time taken: {elapsed:.2f} seconds")
print(f"Peak memory used: {max_memory:.2f} MB")
print(f"File size on disk: {file_size_gb:.2f} GB")

File size on disk: 9.61 GB

Pandas + chunksize completed successfully
Total rows read: 160,108,003
Time taken: 238.89 seconds
Peak memory used: 216.14 MB
File size on disk: 9.61 GB


In [3]:
!pip install dask



In [4]:
import dask.dataframe as dd
import time
import psutil
import os

file_path = "/kaggle/input/telecom/data1.csv"


file_size_bytes = os.path.getsize(file_path)
file_size_gb = file_size_bytes / (1024 ** 3)
print(f"File size on disk: {file_size_gb:.2f} GB\n")

process = psutil.Process(os.getpid())
start_time = time.time()
max_memory = 0


df = dd.read_csv(file_path)


total_rows = len(df) 
total_rows = df.shape[0].compute()  


current_memory = process.memory_info().rss / (1024 * 1024)
if current_memory > max_memory:
    max_memory = current_memory

elapsed = time.time() - start_time

print("Dask completed successfully")
print(f"Total rows read: {total_rows:,}")
print(f"Time taken: {elapsed:.2f} seconds")
print(f"Peak memory used: {max_memory:.2f} MB")
print(f"File size on disk: {file_size_gb:.2f} GB")

File size on disk: 9.61 GB

Dask completed successfully
Total rows read: 160,108,003
Time taken: 142.17 seconds
Peak memory used: 943.70 MB
File size on disk: 9.61 GB


In [5]:
import pandas as pd
import psutil
import time
import os
import gc

file_path = "/kaggle/input/telecom/data1.csv"

compressed_path = "/kaggle/working/data1_compressed.csv.gz"

gc.collect()
start_time = time.time()
process = psutil.Process(os.getpid())
mem_before = process.memory_info().rss / (1024 ** 2)


df = pd.read_csv(file_path)
df.to_csv(compressed_path, index=False, compression='gzip')

mem_after = process.memory_info().rss / (1024 ** 2)
peak_memory = mem_after - mem_before
time_taken = time.time() - start_time

file_size_gb = os.path.getsize(compressed_path) / (1024 ** 3)

print("Compression (gzip) completed successfully")
print(f"Total rows read: {len(df)}")
print(f"Time taken (read + compress): {time_taken:.2f} seconds")
print(f"Peak memory used: {peak_memory:.2f} MB")
print(f"Compressed file size: {file_size_gb:.2f} GB")

Compression (gzip) completed successfully
Total rows read: 160108003
Time taken (read + compress): 2764.17 seconds
Peak memory used: 9701.80 MB
Compressed file size: 2.41 GB


In [6]:
import pandas as pd

results = {
    "Method": ["Pandas + chunksize", "Dask", "Compression (gzip)"],
    "Total Rows": [160108003, 160108003, 160108003],
    "Time (s)": [163.53, 171.49, 2975.66],
    "Peak Memory (MB)": [287.64, 963.24, 9770.67],
    "File Size (GB)": [9.61, 9.61, 2.41]
}

df_results = pd.DataFrame(results)

print("Comparison of Large CSV Handling Methods\n")
display(df_results)

Comparison of Large CSV Handling Methods



Unnamed: 0,Method,Total Rows,Time (s),Peak Memory (MB),File Size (GB)
0,Pandas + chunksize,160108003,163.53,287.64,9.61
1,Dask,160108003,171.49,963.24,9.61
2,Compression (gzip),160108003,2975.66,9770.67,2.41
