# Read, Write, and Export CSV Files 
Covers loading CSV files, writing data, and exporting to CSV using Pandas.

In [114]:
import pandas as pd
# Option A: Double backslashes
penguins = pd.read_csv("D:\\python_env\\Pandas_Practise_Portfolio\\Datasets\\penguins.csv")
# Option B: Raw string
penguins = pd.read_csv(r"D:\python_env\Pandas_Practise_Portfolio\Datasets\penguins.csv")
# Option C: Forward slashes
penguins = pd.read_csv("D:/python_env/Pandas_Practise_Portfolio/Datasets/penguins.csv")

In pandas, the sep parameter specifies the delimiter that separates fields in csv file. 
1. A standad csv contains (,): column1,column2,column3
2. other delimiters jmight include tabs (\t); semicolons(;) or spaces
Delimiter is alias for sep which does same thing; we can use either but sep is preferable 

In [115]:
help(pd.read_csv)

Help on function read_csv in module pandas.io.parsers.readers:

read_csv(filepath_or_buffer: 'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]', *, sep: 'str | None | lib.NoDefault' = <no_default>, delimiter: 'str | None | lib.NoDefault' = None, header: "int | Sequence[int] | None | Literal['infer']" = 'infer', names: 'Sequence[Hashable] | None | lib.NoDefault' = <no_default>, index_col: 'IndexLabel | Literal[False] | None' = None, usecols: 'UsecolsArgType' = None, dtype: 'DtypeArg | None' = None, engine: 'CSVEngine | None' = None, converters: 'Mapping[Hashable, Callable] | None' = None, true_values: 'list | None' = None, false_values: 'list | None' = None, skipinitialspace: 'bool' = False, skiprows: 'list[int] | int | Callable[[Hashable], bool] | None' = None, skipfooter: 'int' = 0, nrows: 'int | None' = None, na_values: 'Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None' = None, keep_default_na: 'bool' = True, na_filter: 'bool' = True, verbose: 'bool | 

In [116]:
    penguins = pd.read_csv(r"D:\python_env\Pandas_Practise_Portfolio\Datasets\penguins.csv",
    sep=",",
    header=0, #means first row(row 0) is treated as column names and data starts from next row. or Header="infer"
    index_col="rowid")      #exisiting column as index rather than system generated index. 


    #names=[] if we dont have column names or it is not specified. 

    penguins.head()

Unnamed: 0_level_0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,,,,,,2007
5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


import os  # python command that imports the built-in OS module which allows to interact vwith the operating system. 
1. it helps in file and directory manipulation: create, delete, rename and list files and directories. 
2. Path operations: os.path.join() can be used to construct file paths which is correct for specific OS. 

# Process Large CSV files in chunks: it can be too big to fit in memory at once so processing them in similar samller chunks helps:
1. Avoid memory errors
2. Allow streaming and incremental processing
3. let us focus on most important parts only.

In [117]:
import pandas as pd

# Path to the CSV (e.g., Palmer Penguins dataset)
data_path = '..\Datasets\penguins.csv'

# Set chunk size (100 rows per chunk; adjust for larger files)
chunk_size = 100

# Create a generator to read the CSV in chunks, avoiding loading the full file into memory
penguins_chunk_reader = pd.read_csv(data_path, chunksize=chunk_size)

# Loop through each chunk (each chunk is a DataFrame)
for penguins_chunk in penguins_chunk_reader:
    # Inspect chunk size
    print(f"Chunk shape: {penguins_chunk.shape}")
    # Example: Count rows per species in this chunk
    species_counts = penguins_chunk['species'].value_counts()
    print("Species counts in chunk:\n", species_counts)

# Note: For a small dataset (~344 rows), chunking is overkill, but this scales to millions of rows.

Chunk shape: (100, 9)
Species counts in chunk:
 species
Adelie    100
Name: count, dtype: int64
Chunk shape: (100, 9)
Species counts in chunk:
 species
Adelie    52
Gentoo    48
Name: count, dtype: int64
Chunk shape: (100, 9)
Species counts in chunk:
 species
Gentoo       76
Chinstrap    24
Name: count, dtype: int64
Chunk shape: (44, 9)
Species counts in chunk:
 species
Chinstrap    44
Name: count, dtype: int64


When we read with chunksize, pandas does not load the whole dataset into memory as one DataFrame. Instead, it loads DataFrames chunk by chunk. So in a sense, we're still working with DataFrames, just smaller ones, processing them sequentially so memory usage stays manageable

In [118]:
# Example: To calculate the average body_mass_g for each species but the file is too big too load at once. 
import pandas as pd

# Path to your large CSV
data_path="../Datasets/penguins.csv"
# Set chunk size (e.g., 100 rows per chunk)
chunk_size = 100

# Create a chunk reader (returns a generator)
penguins_chunk_reader = pd.read_csv(data_path, chunksize=chunk_size)

# Initialize a list to store results from each chunk
species_means = []

# Iterate over chunks
for chunk in penguins_chunk_reader:
    # Process chunk: Compute mean body_mass_g per species
    chunk_mean = chunk.groupby('species')['body_mass_g'].mean().reset_index()
    species_means.append(chunk_mean)

# Combine results from all chunks
final_result = pd.concat(species_means).groupby('species')['body_mass_g'].mean().reset_index()
print(final_result)

     species  body_mass_g
0     Adelie  3692.174145
1  Chinstrap  3723.674242
2     Gentoo  5068.333333


pick a reasonable chunk_size(10,000-100,000) that keeps memory usage under 1 GB. 

In [119]:
# Filtering and saving:
import pandas as pd
chunk_size=100
data_path="../Datasets/penguins.csv"
penguins_chunk_reader=pd.read_csv(data_path, chunksize=chunk_size)

# header to new CSV once.
header_written=False
for chunk in penguins_chunk_reader:
    filtered_chunk=chunk[chunk["body_mass_g"]>4000]

    # save to csv  (append mode after first chunk)
    filtered_chunk.to_csv("../Datasets/filtered_penguins.csv", mode="a", header= not header_written, index="True",index_label="rowid")
    header_written=True
print("filtered CSV saved!:")


filtered CSV saved!:


# Apply transformations on Dataframe 

In [120]:
import pandas as pd
penguins_data=pd.read_csv(data_path, sep=",")
penguins_data
penguins_data["body_mass_g"]=penguins_data["body_mass_g"].transform(lambda x:x+1000).fillna(penguins_data["body_mass_g"].mean())
penguins_data[["species","body_mass_g"]].head()

Unnamed: 0,species,body_mass_g
0,Adelie,4750.0
1,Adelie,4800.0
2,Adelie,4250.0
3,Adelie,4201.754386
4,Adelie,4450.0


In [121]:
# another Transformation with Airthmetic:
penguins_data["body_mass_kg"]=penguins_data["body_mass_g"]/1000
penguins_data.head()

Unnamed: 0,rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year,body_mass_kg
0,1,Adelie,Torgersen,39.1,18.7,181.0,4750.0,male,2007,4.75
1,2,Adelie,Torgersen,39.5,17.4,186.0,4800.0,female,2007,4.8
2,3,Adelie,Torgersen,40.3,18.0,195.0,4250.0,female,2007,4.25
3,4,Adelie,Torgersen,,,,4201.754386,,2007,4.201754
4,5,Adelie,Torgersen,36.7,19.3,193.0,4450.0,female,2007,4.45


In [122]:
# Filtering data:
dream_penguins=penguins_data[penguins_data["island"]=="Dream"]
dream_penguins.tail()

Unnamed: 0,rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year,body_mass_kg
339,340,Chinstrap,Dream,55.8,19.8,207.0,5000.0,male,2009,5.0
340,341,Chinstrap,Dream,43.5,18.1,202.0,4400.0,female,2009,4.4
341,342,Chinstrap,Dream,49.6,18.2,193.0,4775.0,male,2009,4.775
342,343,Chinstrap,Dream,50.8,19.0,210.0,5100.0,male,2009,5.1
343,344,Chinstrap,Dream,50.2,18.7,198.0,4775.0,female,2009,4.775


In [123]:
# simple categorization:
penguins_data["weight_category"]=penguins_data["body_mass_kg"].apply(lambda x: "Heavy" if x>4 else "light")
print(penguins_data.head())

   rowid species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0      1  Adelie  Torgersen            39.1           18.7              181.0   
1      2  Adelie  Torgersen            39.5           17.4              186.0   
2      3  Adelie  Torgersen            40.3           18.0              195.0   
3      4  Adelie  Torgersen             NaN            NaN                NaN   
4      5  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  year  body_mass_kg weight_category  
0  4750.000000    male  2007      4.750000           Heavy  
1  4800.000000  female  2007      4.800000           Heavy  
2  4250.000000  female  2007      4.250000           Heavy  
3  4201.754386     NaN  2007      4.201754           Heavy  
4  4450.000000  female  2007      4.450000           Heavy  


In [124]:
# Export processed Data From Pandas DataFrame to csv. 
penguins_data.to_csv("..\Datasets\Processed_penguins.csv", index=False, sep=",", na_rep=" ")

In [125]:
# Final notebook code.

import pandas as pd
import os  
data_path="../Datasets/penguins.csv"

# check file and format:
if os.path.exists(data_path):
    # inspect first line
    with open(data_path) as f:
        print("First line of CSV:", f.readline())
    

    #load with tutorial parameters
    try:
        penguins=pd.read_csv(data_path, sep=",",header=0, index_col="rowid")
        print("first five rows:")
        print(penguins.head())

        # export example
        adelie_penguins= penguins[penguins["species"]=="Adelie"]
        adelie_penguins.to_csv("../Datasets/adelie_penguins.csv", index=True, sep=",")
        print("Exported to .. ../Datasets/adelie_penguins.csv ")
    except KeyError as e:
        print(f"Error: {e}. check if 'rowid' exists in CSV.")
else:
    print(f"Error: {e}. check if 'rowid' exists in CSV")
    print("files in ../Datasets:", os.listdir("../datasets"))

First line of CSV: "rowid","species","island","bill_length_mm","bill_depth_mm","flipper_length_mm","body_mass_g","sex","year"

first five rows:
      species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
rowid                                                                        
1      Adelie  Torgersen            39.1           18.7              181.0   
2      Adelie  Torgersen            39.5           17.4              186.0   
3      Adelie  Torgersen            40.3           18.0              195.0   
4      Adelie  Torgersen             NaN            NaN                NaN   
5      Adelie  Torgersen            36.7           19.3              193.0   

       body_mass_g     sex  year  
rowid                             
1           3750.0    male  2007  
2           3800.0  female  2007  
3           3250.0  female  2007  
4              NaN     NaN  2007  
5           3450.0  female  2007  
Exported to .. ../Datasets/adelie_penguins.csv 
