## Data Processing in POLARS

Approximate run time: ~10 s

In [1]:
import polars as pl

In [2]:
# Read the three source tables (users, cards, transactions) from the original-data
# folder into Polars DataFrames, in that order.
users, cards, transactions = (
    pl.read_parquet(source_path)
    for source_path in (
        "../../0 - Data/0 - original/users.pq",
        "../../0 - Data/0 - original/cards.pq",
        "../../0 - Data/0 - original/transactions.pq",
    )
)

In [3]:
# Collect each table's column names as a set so the shared ones can be found by intersection
users_columns = set(users.columns)
cards_columns = set(cards.columns)
transactions_columns = set(transactions.columns)

# Keep only the columns present in all three tables. Sorting turns the set into a
# list with a deterministic order (set iteration order is arbitrary), so
# common_columns[0] refers to the same column on every run even when several
# columns are shared.
common_columns = sorted(users_columns & cards_columns & transactions_columns)

# Handle the empty case explicitly: indexing an empty list raises IndexError,
# which previously prevented the fallback message from ever being printed.
if common_columns:
    print(f"Common Columns (Can be used as Key): {common_columns[0]}")
else:
    print("Common Columns (Can be used as Key): There are no common columns!")

Common Columns (Can be used as Key): User


In [4]:
# Both joins below use the first shared column detected earlier as their key.
join_key = common_columns[0]

# Step 1: attach the card attributes to the transactions (inner join on the key).
# NOTE(review): the key is a single column. If `cards` holds several rows per key
# value, every transaction is repeated once per matching card row. The merged
# table contains both 'Card' and 'CARD INDEX' -- confirm whether that pair should
# also be part of the join condition.
transactions_cards = transactions.join(cards, on=join_key, how='inner')

# Step 2: attach the user attributes to the transaction + card rows (inner join on the same key).
full_transaction_table = transactions_cards.join(users, on=join_key, how='inner')

In [5]:
# Summarise the merged table: its dimensions first, then every column name,
# three names per output line, framed by two 70-character rules.
columns = full_transaction_table.columns
num_of_columns = len(columns)
separator = "=" * 70

print("Merged Table INFO")
print("|")
print(f"|- Number of Columns: {num_of_columns}")
print(f"|- Number of Rows: {full_transaction_table.shape[0]}")
print(separator)
# Walk the column list in steps of three; the final group may be shorter
for group_start in range(0, num_of_columns, 3):
    column_group = columns[group_start:group_start + 3]
    print(f"|-- {', '.join(column_group)}")
print(separator)

Merged Table INFO
|
|- Number of Columns: 45
|- Number of Rows: 13412376
|-- User, Card, Year
|-- Month, Day, Time
|-- Amount, Use Chip, Merchant Name
|-- Merchant City, Merchant State, Zip
|-- MCC, Errors?, Is Fraud?
|-- CARD INDEX, Card Brand, Card Type
|-- Card Number, Expires, CVV
|-- Has Chip, Cards Issued, Credit Limit
|-- Acct Open Date, Year PIN last Changed, Card on Dark Web
|-- Person, Current Age, Retirement Age
|-- Birth Year, Birth Month, Gender
|-- Address, Apartment, City
|-- State, Zipcode, Latitude
|-- Longitude, Per Capita Income - Zipcode, Yearly Income - Person
|-- Total Debt, FICO Score, Num Credit Cards


In [8]:
import os

# Directory that receives the merged dataset. The output path below is derived
# from this value so the created directory and the written file cannot diverge.
merge_dir = "../../0 - Data/1 - merge"

# exist_ok=True creates the directory (and missing parents) when absent and does
# nothing when it already exists, replacing the separate exists-then-create check.
os.makedirs(merge_dir, exist_ok=True)

# Persist the merged table as a Parquet file inside merge_dir
full_transaction_table.write_parquet(f"{merge_dir}/merged_transactions.pq")