# DATA 304 – Module 3, Session 1 Demo
Flat files, paths, CSV/Excel, compression, and large-file strategies.

## Raw strings

In [1]:
# Intentionally broken: in a normal (non-raw) string literal, "\U" begins an
# 8-digit unicode escape, so this cell fails with the SyntaxError shown below.
my_path = "C:\Users\data\file.cs"
print(my_path)

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (3055514616.py, line 1)

In [2]:
# Fix 1: escape every backslash so it is kept as a literal path separator.
my_path_fixed = 'C:\\Users\\data\\file.cs'
print(my_path_fixed)

C:\Users\data\file.cs


In [3]:
# Fix 2: a raw string (r prefix) leaves backslashes untouched, no doubling needed.
my_path_raw = r'C:\Users\data\file.cs'
print(my_path_raw)

C:\Users\data\file.cs


## Paths with `pathlib`
- Use relative paths for portability
- Avoid hard-coded OS-specific separators
- Prefer `Path` arithmetic

In [4]:
from pathlib import Path

# Every location is expressed relative to the working directory, so no
# machine-specific absolute path or OS-specific separator is hard-coded.
ROOT_DIR = Path("..")                      # parent of the working directory
CUR_DIR1 = ROOT_DIR.joinpath("Module03")   # module folder, reached via the parent
CUR_DIR2 = Path(".")                       # the working directory itself
DATA_DIR = CUR_DIR1.joinpath("data")       # data folder inside the module folder

In [5]:
# Show each configured directory exactly as written (still relative).
for label, directory in (
    ("Root directory is:\t", ROOT_DIR),
    ("Current directory is:\t", CUR_DIR1),
    ("Current directory is:\t", CUR_DIR2),
    ("Data directory is:\t", DATA_DIR),
):
    print(label, directory)

Root directory is:	 ..
Current directory is:	 ../Module03
Current directory is:	 .
Data directory is:	 ../Module03/data


In [6]:
# The concrete Path class depends on the operating system running the kernel.
for directory in (ROOT_DIR, CUR_DIR1, CUR_DIR2, DATA_DIR):
    print(type(directory))

<class 'pathlib.PosixPath'>
<class 'pathlib.PosixPath'>
<class 'pathlib.PosixPath'>
<class 'pathlib.PosixPath'>


In [7]:
# resolve() turns each relative path into an absolute one.
for label, directory in (
    ("Root directory is:\t", ROOT_DIR),
    ("Current directory is:\t", CUR_DIR1),
    ("Current directory is:\t", CUR_DIR2),
    ("Data directory is:\t", DATA_DIR),
):
    print(label, directory.resolve())

Root directory is:	 /workspaces/examples
Current directory is:	 /workspaces/examples/Module03
Current directory is:	 /workspaces/examples/Module03
Data directory is:	 /workspaces/examples/Module03/data


In [8]:
# Inspect files.
# iterdir() yields entries in arbitrary, filesystem-dependent order; sorting
# makes the listing reproducible from run to run and machine to machine.
sorted(CUR_DIR1.iterdir())

[PosixPath('../Module03/session1_activity.ipynb'),
 PosixPath('../Module03/session2_demo.ipynb'),
 PosixPath('../Module03/data'),
 PosixPath('../Module03/session1_demo.ipynb'),
 PosixPath('../Module03/session2_activity.ipynb')]

## Reading a clean CSV

In [9]:
import pandas as pd

# Well-formed, comma-separated file: the read_csv defaults are used as-is.
clean_path = DATA_DIR.joinpath("clean_sales.csv")
df_clean = pd.read_csv(filepath_or_buffer=clean_path)
df_clean.head()

Unnamed: 0,order_id,city,amount,date
0,1,Knoxville,19.99,2025-01-05
1,2,Nashville,5.25,2025-01-06
2,3,Memphis,8.75,2025-01-07
3,4,Chattanooga,12.4,2025-01-08
4,5,Knoxville,15.6,2025-01-09


In [10]:
# Summarize the frame: column dtypes, non-null counts, and memory usage.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   order_id  5 non-null      int64  
 1   city      5 non-null      object 
 2   amount    5 non-null      float64
 3   date      5 non-null      object 
dtypes: float64(1), int64(1), object(2)
memory usage: 292.0+ bytes


## CSV with semicolon delimiter and European decimals
- Use `sep=';'`
- Replace comma decimals and coerce to numeric
- Map NA tokens

In [11]:
# First attempt: no `sep` argument, so the default comma separator is used.
messy_path = DATA_DIR.joinpath("messy_semicolon.csv")
df_messy = pd.read_csv(filepath_or_buffer=messy_path)
df_messy.head()

Unnamed: 0,order_id;city;amount;date
1;Knoxville;19,99;2025-01-05
2;Nashville;NA;2025-01-06,
3;Memphis;8,75;2025-01-07
4;Chattanooga;--;2025-01-08,


In [12]:
! head data/messy_semicolon.csv

order_id;city;amount;date
1;Knoxville;19,99;2025-01-05
2;Nashville;NA;2025-01-06
3;Memphis;8,75;2025-01-07
4;Chattanooga;--;2025-01-08


In [13]:
# Second attempt: split on ';' and treat the 'NA' and '--' tokens as missing.
df_messy = pd.read_csv(
    messy_path,
    sep=';',
    na_values=['NA', '--'],
)
df_messy.head()

Unnamed: 0,order_id,city,amount,date
0,1,Knoxville,1999.0,2025-01-05
1,2,Nashville,,2025-01-06
2,3,Memphis,875.0,2025-01-07
3,4,Chattanooga,,2025-01-08


In [14]:
# Check column dtypes after the semicolon-aware read (before fixing 'amount').
df_messy.dtypes

order_id     int64
city        object
amount      object
date        object
dtype: object

In [15]:
# Convert 'amount' from '19,99' style to float.
# - astype(str) keeps the cell re-runnable once the column is already numeric
#   (the .str accessor would fail on a float column).
# - to_numeric(errors='coerce') turns any leftover non-numeric token into NaN
#   instead of aborting the whole conversion with a ValueError.
amount_text = df_messy['amount'].astype(str).str.replace(',', '.', regex=False)
df_messy['amount'] = pd.to_numeric(amount_text, errors='coerce')
df_messy

Unnamed: 0,order_id,city,amount,date
0,1,Knoxville,19.99,2025-01-05
1,2,Nashville,,2025-01-06
2,3,Memphis,8.75,2025-01-07
3,4,Chattanooga,,2025-01-08


In [16]:
# Re-check dtypes to confirm the 'amount' conversion took effect.
df_messy.dtypes

order_id      int64
city         object
amount      float64
date         object
dtype: object

## Quoting and multiline fields
- Set `quotechar` to the file's actual quote character so pandas can handle embedded commas and newlines

In [17]:
# Defined outside the try block so later cells can always rely on quoted_path.
quoted_path = DATA_DIR / "multiline_quotes.csv"
try:
    df_quotes = pd.read_csv(quoted_path)
except pd.errors.ParserError as e:
    # Expected for this demo: the default settings cannot tokenize the file.
    # Catching only ParserError lets unrelated problems (e.g. a missing file)
    # surface instead of being reported as a parsing issue.
    print("Error:", e)

Error: Error tokenizing data. C error: Expected 3 fields in line 3, saw 4



In [18]:
! head data/multiline_quotes.csv

This is a file for testing.
order_id,city,notes
1,'New York, USA','Line1
Line2 with comma, inside quotes'
2,'Paris, FR','Contains "quoted" text'


In [19]:
# Skip the leading free-text line; the quote character is still the default.
df_quotes = pd.read_csv(
    quoted_path,
    skiprows=1,
)
df_quotes

Unnamed: 0,order_id,city,notes
1,'New York,USA','Line1
Line2 with comma,inside quotes',,
2,'Paris,FR',"'Contains ""quoted"" text'"


In [20]:
# Skip the leading free-text line and declare the single quote as the quote
# character, so quoted commas and newlines stay inside their field.
df_quotes = pd.read_csv(
    quoted_path,
    skiprows=1,
    quotechar="'",
)
df_quotes

Unnamed: 0,order_id,city,notes
0,1,"New York, USA","Line1\nLine2 with comma, inside quotes"
1,2,"Paris, FR","Contains ""quoted"" text"


## Excel with multiple sheets and junk rows
- Identify sheets
- Skip metadata rows
- Fix headers if needed

In [21]:
# Open the workbook once and list its sheets before parsing any of them.
xls_path = DATA_DIR.joinpath("report.xlsx")
xe = pd.ExcelFile(xls_path)
xe.sheet_names

['Summary', 'RawData']

In [22]:
# Parse the 'Summary' sheet as-is to see what sits above the real table.
df_summary = xe.parse(sheet_name="Summary")
df_summary

Unnamed: 0,Report generated 2025-01-10,Unnamed: 1,Unnamed: 2
0,Department: Sales,,
1,,,
2,id,region,revenue
3,101,East,1000.5
4,102,West,850.3
5,103,South,920.15


In [23]:
# Skip the 3 rows above the header so 'id, region, revenue' becomes the header.
df_summary = xe.parse(sheet_name="Summary", skiprows=3)
df_summary

Unnamed: 0,id,region,revenue
0,101,East,1000.5
1,102,West,850.3
2,103,South,920.15


In [24]:
# Same read through pd.read_excel, straight from the file path:
# take the 'Summary' sheet and skip its top 3 junk rows.
df_summary = pd.read_excel(
    io=xls_path,
    sheet_name="Summary",
    skiprows=3,
)
df_summary

Unnamed: 0,id,region,revenue
0,101,East,1000.5
1,102,West,850.3
2,103,South,920.15


In [25]:
# Select the sheet by name rather than by position, so the cell keeps working
# if the workbook's sheet order ever changes.
df_raw = xe.parse("RawData")
# Last use of the workbook handle: release the underlying file.
# (Re-run the cell that creates `xe` before re-running earlier parse cells.)
xe.close()
df_raw

Unnamed: 0,id,state,amount
0,201,TN,20.5
1,202,GA,25.0
2,203,AL,18.75


## Reading compressed CSV (gzip)
- Read directly from `.csv.gz` without extracting

In [26]:
! head data/events.csv.gz

%T�5P�-�(C�C�R4�;�0D�~�2E����<!ႂ4D���ʣ+���s���s;��Z>w�B���4����C��
,d4J����N��  �� �W�	�   

In [27]:
! gzcat data/events.csv.gz 

/bin/bash: line 1: gzcat: command not found


In [28]:
# pandas decompresses while reading; the archive never has to be extracted.
gz_path = DATA_DIR.joinpath("events.csv.gz")
df_gz = pd.read_csv(filepath_or_buffer=gz_path, compression='gzip')
df_gz.head()

Unnamed: 0,event_id,name,attendees
0,1,event_1,89
1,2,event_2,100
2,3,event_3,126
3,4,event_4,32
4,5,event_5,61


## Large file strategy with `chunksize`
- Stream rows in chunks
- Filter early to reduce memory
- Aggregate incrementally

In [29]:
! wc -l data/large_synthetic.csv

200001 data/large_synthetic.csv


In [30]:
# With chunksize set, read_csv returns a reader object rather than a DataFrame.
large_path = DATA_DIR.joinpath("large_synthetic.csv")
iter_chunks = pd.read_csv(large_path, chunksize=30_000)
type(iter_chunks)

pandas.io.parsers.readers.TextFileReader

In [31]:
# Walk through the reader once; each chunk is an ordinary DataFrame.
for chunk in iter_chunks:
    df = chunk
    n_rows, n_cols = df.shape
    print((n_rows, n_cols))

(30000, 3)
(30000, 3)
(30000, 3)
(30000, 3)
(30000, 3)
(30000, 3)
(20000, 3)


In [32]:
# NOTE: iter_chunks was already consumed by the loop in the previous cell,
# so this loop body never runs and nothing is printed.
for chunk in iter_chunks:
    is_flag_a = chunk["flag"] == "A"
    df = chunk[is_flag_a]
    print(df.shape)

In [33]:
# Build a fresh reader, then keep only the flag == "A" rows of each chunk.
iter_chunks = pd.read_csv(large_path, chunksize=30_000)
for chunk in iter_chunks:
    is_flag_a = chunk["flag"] == "A"
    df = chunk[is_flag_a]
    print(df.shape)

(10153, 3)
(10010, 3)
(10001, 3)
(9889, 3)
(10074, 3)
(10014, 3)
(6776, 3)


In [34]:
# Filter each chunk as it streams in, collect the filtered pieces in a list,
# and concatenate once at the end. Calling pd.concat inside the loop re-copies
# every accumulated row on each iteration (quadratic work), and concatenating
# onto an empty DataFrame is deprecated in recent pandas versions.
filtered_chunks = []
rows_so_far = 0
# The context manager closes the reader even if the loop is interrupted.
with pd.read_csv(large_path, chunksize=30000) as iter_chunks:
    for chunk in iter_chunks:
        kept = chunk[chunk["flag"] == "A"]
        filtered_chunks.append(kept)
        rows_so_far += len(kept)
        # Running shape of the accumulated result, same output as before.
        print((rows_so_far, kept.shape[1]))

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(filtered_chunks) if filtered_chunks else pd.DataFrame()

(10153, 3)
(20163, 3)
(30164, 3)
(40053, 3)
(50127, 3)
(60141, 3)
(66917, 3)


## Memory inspection and dtype optimization
- Identify heavy columns
- Downcast numeric types
- Convert repeated strings to `category`

In [35]:
# Load the whole file and measure per-column memory; deep=True also counts
# the Python objects stored in object-dtype columns.
df_sample = pd.read_csv(filepath_or_buffer=large_path)
df_sample.memory_usage(deep=True)

Index           132
user_id     1600000
value       1600000
flag       10000000
dtype: int64

In [36]:
# Optimize
df_opt = df_sample.copy()
df_opt['flag'] = df_opt['flag'].astype('category')
df_opt.memory_usage(deep=True)

Index          132
user_id    1600000
value      1600000
flag        200258
dtype: int64

In [37]:
# Shrink user_id to the smallest unsigned integer dtype that fits its values.
user_id_small = pd.to_numeric(df_opt['user_id'], downcast='unsigned')
df_opt['user_id'] = user_id_small
df_opt.memory_usage(deep=True)

Index          132
user_id     400000
value      1600000
flag        200258
dtype: int64

In [38]:
# Shrink 'value' to the smallest float dtype pandas will downcast to.
# NOTE(review): a narrower float stores fewer significant digits; confirm
# that precision is acceptable for downstream use.
value_small = pd.to_numeric(df_opt['value'], downcast='float')
df_opt['value'] = value_small
df_opt.memory_usage(deep=True)

Index         132
user_id    400000
value      800000
flag       200258
dtype: int64

In [39]:
def _total_mb(frame):
    """Return the total deep memory usage of *frame* in MB (1024**2 bytes)."""
    return frame.memory_usage(deep=True).sum() / (1024**2)


# Compare the original frame with the dtype-optimized copy.
before = _total_mb(df_sample)
after = _total_mb(df_opt)
reduction = round(100 * (1 - (after / before)), 2)

print(f"Size before optimization:\t{before:.2f} MB")
print(f"Size after optimization:\t{after:.2f} MB")
print(f"Reduction in size: \t\t{reduction:.2f}%")

Size before optimization:	12.59 MB
Size after optimization:	1.34 MB
Reduction in size: 		89.39%
