In [1]:
import pandas as pd

In [2]:
# Load the Books_rating CSV into a pandas DataFrame.
# NOTE: this is an absolute, machine-specific path - point it at your own copy of the dataset.
csv_path = r"C:\Users\krish\Documents\DG\week6\Sales_Data\Books_rating.csv"
data = pd.read_csv(csv_path)

# Preview the first rows as a quick sanity check that the load worked.
data.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [3]:
# We usually read files with pandas; let's see how long it takes to load a big file.

In [4]:
#"time" library does exactly that, by giving us the execution time for a certain code block

In [5]:
import time

# Measure the wall-clock time of a full pandas load of the CSV.
start_time = time.time()
data = pd.read_csv(r"C:\Users\krish\Documents\DG\week6\Sales_Data\Books_rating.csv")
elapsed = time.time() - start_time
print(f"Pandas read time: {elapsed:.2f} seconds")

Pandas read time: 18.91 seconds


In [6]:
# Around 20 seconds seems acceptable, but let's try some other methods!

In [7]:
#DASK

In [8]:
#pip install "dask[complete]"   

In [9]:
import dask.dataframe as dd

# Measure how long dd.read_csv takes to return.
# NOTE: Dask loads lazily (see the summary further down), so this measures setting up
# the read rather than a full load of the file into memory.
start_time = time.time()
data_dask = dd.read_csv(r"C:\Users\krish\Documents\DG\week6\Sales_Data\Books_rating.csv")
elapsed = time.time() - start_time
print(f"Dask read time: {elapsed:.2f} seconds")

Dask read time: 0.02 seconds


In [10]:
# Well, that certainly looks fast - but Dask is lazy, so this mostly measures setting up the read, not loading the data.

In [11]:
#let's try some other libraries

In [12]:
#MODIN

In [13]:
#pip install "modin[all]"

In [14]:
import modin.pandas as mpd
import ray

# Start (or reuse) a local Ray instance for Modin to run on.
# ignore_reinit_error=True keeps this cell re-runnable: a bare ray.init() raises
# if Ray has already been initialised in the current kernel.
ray.init(ignore_reinit_error=True)

# Measure the wall-clock time of a Modin read of the same CSV.
# Ray start-up above is deliberately outside the timed section.
start_time = time.time()
data_modin = mpd.read_csv(r"C:\Users\krish\Documents\DG\week6\Sales_Data\Books_rating.csv")
print(f"Modin read time: {time.time() - start_time:.2f} seconds")

2024-09-12 18:11:08,360	INFO worker.py:1783 -- Started a local Ray instance.


Modin read time: 66.49 seconds


In [15]:
# That took unusually long - much longer than our traditional pandas method. Not good!

In [16]:
#RAY

In [17]:
#pip install ray

In [18]:
import ray.data

# Measure how long ray.data.read_csv takes to return a dataset for the same CSV.
# NOTE(review): Ray Data may defer the actual read until the dataset is consumed -
# confirm before comparing this figure with the pandas timing.
start_time = time.time()
data_ray = ray.data.read_csv(r"C:\Users\krish\Documents\DG\week6\Sales_Data\Books_rating.csv")
elapsed = time.time() - start_time
print(f"Ray read time: {elapsed:.2f} seconds")

Ray read time: 2.24 seconds


Pandas: Took 18.91 seconds. It loads the entire file into memory, which is fine for medium-sized files but can be slow for very large ones.

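Dask: Took just 0.02 seconds. It appears super fast because it uses lazy loading: `dd.read_csv` only sets up chunked reads and doesn't load the file all at once, so the data is not actually read until a computation is requested. This figure is therefore not directly comparable to the pandas timing.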

Modin: Took 66.49 seconds. Modin tries to scale Pandas operations, but its parallel-processing overhead made it slower in this case.

Ray: Took 2.24 seconds. Ray is designed for distributed processing and performed very well, reading the file efficiently.

**Dask and Ray are the best for large files due to their parallel processing and chunking.**

**Pandas is okay for smaller files but slower with larger ones.**

**Modin might not always be faster due to its parallel processing overhead.**

In [19]:
# Let's work with pandas from here on, for simplicity.

In [20]:
# Preview the first rows of the pandas DataFrame before any renaming.
data.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [21]:
# List the current column labels so we know what needs renaming.
data.columns

Index(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness',
       'review/score', 'review/time', 'review/summary', 'review/text'],
      dtype='object')

In [22]:
# Shorten the user and review/* column labels.
# Keys are the labels as they appear in the CSV; values are the new labels.
column_renames = {
    'User_id': 'User_ID',
    'profileName': 'User_Name',
    'review/helpfulness': 'review',
    'review/score': 'score',
    'review/time': 'time',
    'review/summary': 'summary',
    'review/text': 'text',
}

# Apply the mapping to `data` itself (in place); columns not listed keep their names.
data.rename(columns=column_renames, inplace=True)

In [24]:
# Preview the DataFrame again to confirm the new column labels.
data.head()

Unnamed: 0,Id,Title,Price,User_ID,User_Name,review,score,time,summary,text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [25]:
# Normalise the column labels: trim surrounding whitespace first, then collapse every
# run of non-alphanumeric characters into a single underscore.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently turn this line into a no-op.
data.columns = data.columns.str.strip().str.replace('[^A-Za-z0-9]+', '_', regex=True)

In [28]:
# Show the column labels after clean-up.
print(data.columns)

Index(['Id', 'Title', 'Price', 'User_ID', 'User_Name', 'review', 'score',
       'time', 'summary', 'text'],
      dtype='object')


In [29]:
#Now to store schema we will use "YAML"

In [30]:
import yaml

In [31]:
# Sanity check: echoing the module shows which installation of yaml was imported.
yaml

<module 'yaml' from 'C:\\Users\\krish\\anaconda3\\envs\\notebook-6.5.7\\Lib\\site-packages\\yaml\\__init__.py'>

In [32]:
# Schema for the exported file: the field separator plus the expected column
# labels, in order.
column_names = [
    'Id', 'Title', 'Price', 'User_ID', 'User_Name',
    'review', 'score', 'time', 'summary', 'text',
]
schema = {'file': {'separator': '|', 'columns': column_names}}

In [33]:
# Echo the schema dict to check its structure before saving it.
schema

{'file': {'separator': '|',
  'columns': ['Id',
   'Title',
   'Price',
   'User_ID',
   'User_Name',
   'review',
   'score',
   'time',
   'summary',
   'text']}}

In [34]:
# Serialise the schema dict to schema.yaml in the current working directory.
with open('schema.yaml', 'w') as file:
    yaml.dump(schema, stream=file)

In [35]:
# You should now see the schema.yaml file in your working directory.

In [36]:
#Let's validate the integrity of the saved yaml file

In [37]:
# Read schema.yaml back and parse it, replacing the in-memory schema with what is on disk.
with open('schema.yaml', 'r') as file:
    schema = yaml.safe_load(file.read())

In [43]:
# Validation 1: the DataFrame and the schema must list the same number of columns.
len(data.columns) == len(schema['file']['columns'])

True

In [44]:
# Validation 2: list any DataFrame columns that are missing from the schema
# (an empty list means every DataFrame column is declared in the schema).
[col for col in data.columns if col not in schema['file']['columns']]

[]

In [None]:
#No mismatched column names

In [39]:
# Show the schema as loaded back from schema.yaml.
print(schema)

{'file': {'columns': ['Id', 'Title', 'Price', 'User_ID', 'User_Name', 'review', 'score', 'time', 'summary', 'text'], 'separator': '|'}}


In [41]:
# Show the DataFrame's column labels for a visual comparison with the schema.
print(data.columns)

Index(['Id', 'Title', 'Price', 'User_ID', 'User_Name', 'review', 'score',
       'time', 'summary', 'text'],
      dtype='object')


In [42]:
# Yes, everything seems correct!

In [45]:
#Write the DataFrame to a pipe-separated text file

In [46]:
# Name of the gzip-compressed, pipe-separated export (relative to the working directory).
output_file = 'Books_rating.txt.gz'

In [47]:
# Export the DataFrame as gzip-compressed, pipe-separated text without the index column.
data.to_csv(
    output_file,
    sep='|',
    compression='gzip',
    index=False,
)

In [48]:
# Just in case, read the file back to verify it was written correctly.

In [49]:
# Round-trip check: load the exported file with the same separator and compression.
df = pd.read_csv(
    output_file,
    sep='|',
    compression='gzip',
)

In [50]:
# Display the re-loaded DataFrame (pandas shows only the first and last rows of a large frame).
df

Unnamed: 0,Id,Title,Price,User_ID,User_Name,review,score,time,summary,text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,0826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,0826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,0826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,0826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...
...,...,...,...,...,...,...,...,...,...,...
2999995,B000NSLVCU,The Idea of History,,,,14/19,4.0,937612800,Difficult,"This is an extremely difficult book to digest,..."
2999996,B000NSLVCU,The Idea of History,,A1SMUB9ASL5L9Y,jafrank,1/1,4.0,1331683200,Quite good and ahead of its time occasionally,This is pretty interesting. Collingwood seems ...
2999997,B000NSLVCU,The Idea of History,,A2AQMEKZKK5EE4,"L. L. Poulos ""Muslim Mom""",0/0,4.0,1180224000,Easier reads of those not well versed in histo...,"This is a good book but very esoteric. ""What i..."
2999998,B000NSLVCU,The Idea of History,,A18SQGYBKS852K,"Julia A. Klein ""knitting rat""",1/11,5.0,1163030400,"Yes, it is cheaper than the University Bookstore","My daughter, a freshman at Indiana University,..."


In [51]:
#Everything seems correct!

In [53]:
# (rows, columns) of the original DataFrame.
data.shape

(3000000, 10)

In [52]:
# (rows, columns) of the re-loaded DataFrame; should match data.shape.
df.shape

(3000000, 10)

In [55]:
import os

In [59]:
# Summarise the re-loaded DataFrame and the size of the compressed file on disk.
n_rows, n_cols = df.shape
print(f"Total number of rows: {n_rows}")
print(f"Total number of columns: {n_cols}")

# Convert bytes to units of 1024**3 bytes, rounded to three decimals.
size_gb = round(os.path.getsize(output_file) / (1024 * 1024 * 1024), 3)
print(f"File size: {size_gb} GB")

Total number of rows: 3000000
Total number of columns: 10
File size: 0.986 GB
