**First of all we need to install some libraries**

In [1]:
!pip install dask
!pip install modin
!pip install PyYAML



**Let's import some libraries**

In [4]:
import pandas as pd
import dask.dataframe as dd
import modin.pandas as mpd
import time

**Now it is time to check which method is good to read the file efficiently** 

In [8]:
# Pandas
start_time = time.time()
df_pandas = pd.read_csv('AB_NYC_2019.csv')
pandas_time = time.time() - start_time
print(f"Pandas read time: {pandas_time}")

Pandas read time: 0.3037374019622803


In [9]:
# Dask
start_time = time.time()
df_dask = dd.read_csv('AB_NYC_2019.csv')
dask_time = time.time() - start_time
print(f"Dask read time: {dask_time}")

Dask read time: 0.008010149002075195


In [10]:
# Modin
start_time = time.time()
df_modin = mpd.read_csv('AB_NYC_2019.csv')
modin_time = time.time() - start_time
print(f"Modin read time: {modin_time}")

Modin read time: 0.14404082298278809


In [11]:
# Now let's print out the time taken for each library to read the file
print("Read time comparison:")
print(f"Pandas: {pandas_time:.2f}s")
print(f"Dask: {dask_time:.2f}s")
print(f"Modin: {modin_time:.2f}s")

Read time comparison:
Pandas: 0.30s
Dask: 0.01s
Modin: 0.14s


### Now we can perform basic validation on data columns

In [12]:
df_pandas.columns = df_pandas.columns.str.replace('[^\w\s]','').str.strip()



**Now let's create a YAML file for the expected column names and write the column names into it**

In [13]:
#Let's import yaml
import yaml

In [14]:
expected_columns = list(df_pandas.columns)

with open('column_names.yaml', 'w') as file:
    yaml.dump(expected_columns, file)

**Now let's validate the number of columns and column names of the ingested file with the YAML file**

In [15]:
with open('column_names.yaml', 'r') as file:
    expected_columns = yaml.safe_load(file)

if len(df_pandas.columns) != len(expected_columns):
    print('The number of columns does not match the expected number')
elif not all([col in df_pandas.columns for col in expected_columns]):
    print('Some of the expected columns are missing')
else:
    print('All columns match the expected columns')

All columns match the expected columns


### Let's write the file in pipe-separated text format in GZ format using pandas

In [16]:
df_pandas.to_csv('AB_NYC_2019_pipe.gz', sep='|', compression='gzip', index=False)