## Data Types Check

In [None]:
import polars as pl

# Load the merged transactions parquet file into a polars DataFrame
merged_path = "../0 - Data/1 - merge/merged_transactions.pq"
df = pl.read_parquet(merged_path)

# Preview the first rows
df.head()

#### Combine `Year`, `Month`, `Day`, and `Time` into a single `Datetime` column

In [2]:
# Date-part columns are stored as Int32; 'Time' is kept as text until it is dropped below
date_part_columns = ["Year", "Month", "Day"]

df = df.with_columns(
    [pl.col(part).cast(pl.Int32).alias(part) for part in date_part_columns]
    + [
        pl.col("Time").cast(pl.Utf8).alias("Time"),

        # Temporary text column shaped like "YYYY-MM-DD-HH:MM"; month and day
        # are zero-padded to two digits.
        # NOTE(review): expressions inside one with_columns call all read the
        # incoming frame, so these pieces come from the columns as loaded, not
        # from the casts listed above — confirm that is intended.
        pl.concat_str(
            [
                pl.col("Year").cast(pl.Utf8),
                pl.col("Month").cast(pl.Utf8).str.zfill(2),
                pl.col("Day").cast(pl.Utf8).str.zfill(2),
                pl.col("Time"),
            ],
            separator="-",
        ).alias("Datetime_str"),
    ]
)

# Turn the temporary text column into a real Datetime column
parsed_datetime = pl.col("Datetime_str").str.strptime(
    pl.Datetime, format="%Y-%m-%d-%H:%M"
)
df = df.with_columns([parsed_datetime.alias("Datetime")])

# 'Time' and the temporary text column are no longer needed
df = df.drop(["Time", "Datetime_str"])

#### Handle Rest of the Date Columns

- Expires
- Acct Open Date

In [3]:
# Convert the "MM/YYYY" text columns to dates.
# A calendar date needs a day component, which these values do not carry, so
# each value is anchored to the first day of its month ("01/MM/YYYY") before
# parsing instead of relying on the parser to supply the missing day.
# strict=False is kept as before: values that do not match the format become
# null rather than raising. Null inputs stay null.
month_year_columns = ["Expires", "Acct Open Date"]

df = df.with_columns([
    pl.concat_str([pl.lit("01/"), pl.col(name)])
    .str.strptime(pl.Date, format="%d/%m/%Y", strict=False)
    .alias(name)
    for name in month_year_columns
])

#### Handle Financial Columns

- Amount
- Credit Limit
- Yearly Income - Person
- Total Debt
- Per Capita Income - Zipcode

In [4]:
import polars as pl

# Monetary columns stored as text with a "$" sign
financial_columns = [
    "Amount",
    "Credit Limit",
    "Yearly Income - Person",
    "Total Debt",
    "Per Capita Income - Zipcode",
]
# Columns that only need a direct integer cast
integer_columns = ["FICO Score", "Num Credit Cards"]

# Remove the first literal "$" from each non-null value, then convert to Float64;
# null values stay null.
currency_exprs = [
    pl.when(pl.col(name).is_not_null())
    .then(pl.col(name).str.replace("$", "", literal=True))
    .otherwise(None)
    .cast(pl.Float64)
    .alias(name)
    for name in financial_columns
]
integer_exprs = [pl.col(name).cast(pl.Int64).alias(name) for name in integer_columns]

# Apply both groups of conversions in a single pass
df = df.with_columns(currency_exprs + integer_exprs)

#### Handle Boolean Columns

- Is Fraud?
- Has Chip
- Card on Dark Web

In [5]:
# Encode the yes/no text flags as integer indicators: 1 when the column equals
# its "true" marker, 0 otherwise. Note the marker for 'Has Chip' is upper-case.
flag_specs = [
    # (source column, "true" marker, output column)
    ("Is Fraud?", "Yes", "Is Fraud"),
    ("Has Chip", "YES", "Has Chip"),
    ("Card on Dark Web", "Yes", "Card on Dark Web"),
]

df = df.with_columns([
    pl.when(pl.col(source) == marker).then(1).otherwise(0).alias(target)
    for source, marker, target in flag_specs
])

# The original text column is replaced by the new 'Is Fraud' indicator
df = df.drop(["Is Fraud?"])

#### Categorical Columns (UTF8)

- Merchant Name
- Card Type
- Card Brand
- Merchant City
- Merchant State
- Zip
- Use Chip

In [6]:
# Categorical / free-text columns are all stored as Utf8 strings
text_columns = [
    "Merchant Name",
    "Card Brand",
    "Card Type",
    "Merchant City",
    "Merchant State",
    "Zip",
    "Use Chip",
]

# One multi-column selector: each column is cast and keeps its own name
df = df.with_columns(pl.col(text_columns).cast(pl.Utf8))

#### Handle Rest of the Numerical Columns (Float)

In [7]:
# Coordinates are stored as Float64; each column keeps its own name
coordinate_columns = ["Latitude", "Longitude"]
df = df.with_columns(pl.col(coordinate_columns).cast(pl.Float64))

#### Handle Null Values

In [None]:
# Find all the columns that contain nulls
def check_for_null_cols(df: pl.DataFrame):
    """Return the names of the columns in ``df`` that contain at least one null.

    Args:
        df: Frame to inspect; it is not modified.

    Returns:
        A list of column names, in the frame's column order, whose null count
        is greater than zero. The list is empty when no column has nulls.
    """
    # Ask each column for its own null count instead of indexing into a
    # one-row summary frame by position.
    return [column.name for column in df.get_columns() if column.null_count() > 0]

print("Columns with NULL values:", check_for_null_cols(df))

In [9]:
# Placeholder used for nulls in each column
# NOTE(review): "Errors?" is filled with the integer 0 while the other columns
# get the text "Unknown" — confirm each placeholder matches its column's dtype.
null_defaults = {
    "Zip": "Unknown",
    "Errors?": 0,
    "Merchant State": "Unknown",
    "Apartment": "Unknown",
}

df = df.with_columns([
    pl.col(name).fill_null(placeholder)
    for name, placeholder in null_defaults.items()
])

In [None]:
# Just to be sure: re-run the null check after the fill step above.
# Any column still listed here was not covered by that step.
print("Columns with NULL values:", check_for_null_cols(df))

#### Type Checked Data

In [None]:
# Preview the first rows of the type-checked frame
df.head()

#### Save the cleaned data

In [17]:
import os

# Output directory for the cleaned dataset
clean_dir = "../0 - Data/2 - clean"

# exist_ok=True creates the directory (and any missing parents) only when
# needed, without a separate existence check that another process could
# invalidate between the check and the creation.
os.makedirs(clean_dir, exist_ok=True)

# Type checked data but with full columns (For further data exploration).
# The target path is built from clean_dir so the directory created above and
# the file location cannot drift apart.
df.write_parquet(f"{clean_dir}/clean_transactions.pq")