In [1]:
import pandas as pd
import numpy as np
import os

#### 1. Preparing Sample Data and Files

In [4]:
# Column-oriented sample records for five students. One Score entry is
# deliberately NaN so the data contains a missing value.
data = dict(
    StudentID=[101, 102, 103, 104, 105],
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Score=[85.5, 92.0, 78.8, np.nan, 88.1],
    Subject=['Math', 'Physics', 'Math', 'Chemistry', 'Physics'],
    EnrollmentDate=pd.to_datetime(
        ['2023-01-15', '2023-01-10', '2023-02-01', '2022-12-20', '2023-01-10']
    ),
)
df_sample = pd.DataFrame(data)

# Banner line followed by the frame itself
print("--- Sample DataFrame ---", df_sample, sep="\n")

--- Sample DataFrame ---
   StudentID     Name  Score    Subject EnrollmentDate
0        101    Alice   85.5       Math     2023-01-15
1        102      Bob   92.0    Physics     2023-01-10
2        103  Charlie   78.8       Math     2023-02-01
3        104    David    NaN  Chemistry     2022-12-20
4        105      Eve   88.1    Physics     2023-01-10


In [5]:
# File names used by the examples: CSV input, Excel file, CSV output
csv_file_path, excel_file_path, csv_output_path = (
    'sample_data.csv',
    'sample_data.xlsx',
    'output_data.csv',
)

In [8]:
# --- Write the sample frame to disk so there is a CSV to read from ---
# (In real work the file would already exist.)
# Passing index=False keeps the DataFrame index out of the file.
df_sample.to_csv(path_or_buf=csv_file_path, index=False)

print(f"Created dummy CSV file: '{csv_file_path}'")

Created dummy CSV file: 'sample_data.csv'


#### 2. Reading Data from CSV Files
- `pd.read_csv()` is the primary function for loading a CSV file into a DataFrame

In [10]:
# Load the CSV with default settings only
df_read_basic = pd.read_csv(filepath_or_buffer=csv_file_path)
print("Basic read from CSV:\n", df_read_basic)

Basic read from CSV:
    StudentID     Name  Score    Subject EnrollmentDate
0        101    Alice   85.5       Math     2023-01-15
1        102      Bob   92.0    Physics     2023-01-10
2        103  Charlie   78.8       Math     2023-02-01
3        104    David    NaN  Chemistry     2022-12-20
4        105      Eve   88.1    Physics     2023-01-10


In [11]:
# With no parse_dates argument, EnrollmentDate comes back as object (string)
column_dtypes = df_read_basic.dtypes
print("\nData types after basic read:\n", column_dtypes)


Data types after basic read:
 StudentID           int64
Name               object
Score             float64
Subject            object
EnrollmentDate     object
dtype: object


In [12]:
# Import pandas, numpy, and os (for file handling)
import pandas as pd
import numpy as np
import os

# --- 1. Preparing Sample Data and Files ---

# Column-oriented sample records for five students. One Score entry is
# deliberately NaN so the data contains a missing value.
data = dict(
    StudentID=[101, 102, 103, 104, 105],
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Score=[85.5, 92.0, 78.8, np.nan, 88.1],
    Subject=['Math', 'Physics', 'Math', 'Chemistry', 'Physics'],
    EnrollmentDate=pd.to_datetime(
        ['2023-01-15', '2023-01-10', '2023-02-01', '2022-12-20', '2023-01-10']
    ),
)
df_sample = pd.DataFrame(data)

# Banner, the frame, then a 30-dash rule - one item per line
print("--- Sample DataFrame ---", df_sample, "-" * 30, sep="\n")

# File names used by the examples: CSV input, Excel file, CSV output
csv_file_path, excel_file_path, csv_output_path = (
    'sample_data.csv',
    'sample_data.xlsx',
    'output_data.csv',
)

# --- Write the sample frame to disk so there is a CSV to read from ---
# (In real work the file would already exist.)
# Passing index=False keeps the DataFrame index out of the file.
df_sample.to_csv(path_or_buf=csv_file_path, index=False)
print(f"Created dummy CSV file: '{csv_file_path}'", "-" * 30, sep="\n")


# --- 2. Reading Data from CSV Files ---
# pd.read_csv() is the primary function

print("--- Reading from CSV ---")

# Load the CSV with default settings only
df_read_basic = pd.read_csv(filepath_or_buffer=csv_file_path)
print("Basic read from CSV:\n", df_read_basic)
# With no parse_dates argument, EnrollmentDate comes back as object (string)
print("\nData types after basic read:\n", df_read_basic.dtypes)
print("-" * 20)

# Reading with common parameters:
# - usecols: restrict the read to the listed columns (saves memory)
# - index_col: column that becomes the DataFrame index
# - parse_dates: columns pandas should try to convert to dates
# - dtype: per-column data type overrides
df_read_params = pd.read_csv(
    csv_file_path,
    usecols=['StudentID', 'Name', 'Score', 'EnrollmentDate'],  # skips Subject
    index_col='StudentID',
    parse_dates=['EnrollmentDate'],
    dtype={'Score': np.float32},
)
print("Read from CSV with parameters (parse_dates, index_col, usecols, dtype):\n", df_read_params)
print("\nData types after read with parameters:\n", df_read_params.dtypes)
print("-" * 20)

# Handling files with no header or different separators
# First write a throwaway file: ';'-separated, no header row, no index column
no_header_sep_path = 'no_header_sep.csv'
df_sample.to_csv(no_header_sep_path, sep=';', header=False, index=False)
print(f"Created dummy CSV file without header, using ';': '{no_header_sep_path}'")

# Read it back: give pandas the separator, state that there is no header
# row (header=None), and supply the column labels to use instead.
df_read_no_header = pd.read_csv(
    no_header_sep_path,
    sep=';',
    header=None,
    names=['ID', 'StudentName', 'Grade', 'Subj', 'Date'],
)
print("\nRead from CSV with no header, custom separator, and assigned names:\n", df_read_no_header)
os.remove(no_header_sep_path)  # Delete the throwaway file
print("-" * 30)


# --- 3. Writing Data to CSV Files ---
# df.to_csv() is the primary function

print("--- Writing to CSV ---")

# Create a modified DataFrame to save.
# df_read_params was loaded with StudentID as its index, and the save below
# passes index=False. Move StudentID back into an ordinary column first;
# otherwise the IDs would be silently dropped from the output file.
# reset_index() returns a new frame, so df_read_params itself is not modified.
df_to_save = df_read_params.reset_index()
df_to_save['Score'] = df_to_save['Score'].fillna(0) # Fill missing score

# Basic save (includes index by default)
# df_to_save.to_csv('output_basic.csv')

# Save without the index, specify separator and float format
df_to_save.to_csv(
    csv_output_path,
    index=False,          # Don't write the DataFrame index as a column
    sep=',',              # Use comma as separator (default)
    float_format='%.2f',  # Format floating point numbers to 2 decimal places
    date_format='%Y-%m-%d' # Specify date format
)
print(f"Saved DataFrame to '{csv_output_path}' without index, specific float/date format.")

# Verify the saved file content (optional)
# with open(csv_output_path, 'r') as f:
#     print("\nContent of saved file:")
#     print(f.read())
print("-" * 30)


# --- 4. Reading and Writing Excel Files ---
# Excel support relies on extra libraries: 'openpyxl' (for .xlsx) or 'xlrd' (for older .xls)
# Install them if they are missing: pip install openpyxl xlrd

print("--- Reading/Writing Excel (requires openpyxl/xlrd) ---")

try:
    # Write the sample frame to a sheet named 'StudentData', leaving out the index
    df_sample.to_excel(excel_file_path, index=False, sheet_name='StudentData')
    print(f"Saved DataFrame to Excel file: '{excel_file_path}'")

    # Load the same sheet back: no index column, EnrollmentDate parsed as dates
    df_read_excel = pd.read_excel(
        excel_file_path,
        parse_dates=['EnrollmentDate'],
        index_col=None,
        sheet_name='StudentData',
    )
    print("\nRead DataFrame from Excel:\n", df_read_excel)

except ImportError:
    # Skip the Excel demo and show the install hint instead
    print(
        "\nSkipping Excel examples: 'openpyxl' or 'xlrd' not installed.",
        "Install using: pip install openpyxl xlrd",
        sep="\n",
    )
print("-" * 30)


# --- 5. Other Formats (Brief Mention) ---
# Further reader/writer pairs offered by pandas:
# - JSON: pd.read_json(), df.to_json()
# - SQL: pd.read_sql(), df.to_sql() (requires SQLAlchemy)
# - HTML: pd.read_html() (reads tables from HTML pages)
# - Parquet: pd.read_parquet(), df.to_parquet() (efficient columnar storage, requires pyarrow or fastparquet)
# - Feather: pd.read_feather(), df.to_feather() (fast binary format, requires pyarrow)
# - HDF5: pd.read_hdf(), df.to_hdf() (hierarchical data format, requires tables)
# - Clipboard: pd.read_clipboard(), df.to_clipboard() (read/write from system clipboard)

# Summary line followed by a 30-dash rule
print(
    "Pandas supports many other I/O formats like JSON, SQL, HTML, Parquet, etc.",
    "-" * 30,
    sep="\n",
)


# --- Clean up created files ---
print("--- Cleaning up created files ---")
# Remove each generated file independently, so that a failure on one file
# does not prevent the remaining files from being removed.
for created_path in (csv_file_path, excel_file_path, csv_output_path):
    try:
        os.remove(created_path)
    except FileNotFoundError:
        # The file was never created (e.g. the Excel step was skipped)
        continue
    except OSError as e:
        print(f"Error removing files: {e}")
    else:
        print(f"Removed '{created_path}'")
print("-" * 30)



--- Sample DataFrame ---
   StudentID     Name  Score    Subject EnrollmentDate
0        101    Alice   85.5       Math     2023-01-15
1        102      Bob   92.0    Physics     2023-01-10
2        103  Charlie   78.8       Math     2023-02-01
3        104    David    NaN  Chemistry     2022-12-20
4        105      Eve   88.1    Physics     2023-01-10
------------------------------
Created dummy CSV file: 'sample_data.csv'
------------------------------
--- Reading from CSV ---
Basic read from CSV:
    StudentID     Name  Score    Subject EnrollmentDate
0        101    Alice   85.5       Math     2023-01-15
1        102      Bob   92.0    Physics     2023-01-10
2        103  Charlie   78.8       Math     2023-02-01
3        104    David    NaN  Chemistry     2022-12-20
4        105      Eve   88.1    Physics     2023-01-10

Data types after basic read:
 StudentID           int64
Name               object
Score             float64
Subject            object
EnrollmentDate     object
dty