## Data Processing in PANDAS

TIME: ~ 47s

In [2]:
import pandas as pd

In [3]:
# Read the three raw parquet tables (users, cards, transactions) into DataFrames
def _read_raw(path):
    """Read a single raw parquet file using the pyarrow engine."""
    return pd.read_parquet(path, engine="pyarrow")


users = _read_raw("../../0 - Data/0 - original/users.pq")
cards = _read_raw("../../0 - Data/0 - original/cards.pq")
transactions = _read_raw("../../0 - Data/0 - original/transactions.pq")

In [3]:
# Collect each table's column names as a set so they can be intersected
users_columns = set(users.columns)
cards_columns = set(cards.columns)
transactions_columns = set(transactions.columns)

# Keep only the columns present in all three tables.
# Set iteration order is arbitrary, so the result is sorted to make
# `common_columns[0]` deterministic when more than one column is shared.
common_columns = sorted(users_columns & cards_columns & transactions_columns, key=str)

# Guard the lookup: indexing an empty list raises IndexError, which would
# prevent the "no common columns" message from ever being printed.
key_label = common_columns[0] if common_columns else 'There are no common columns!'
print(f"Common Columns (Can be used as Key): {key_label}")

Common Columns (Can be used as Key): User


In [4]:
# Use one join key for both sides of each merge. Passing the same name via `on=`
# avoids pairing a computed left key with a hard-coded right key, which would
# join unrelated columns if the computed key were ever something other than 'User'.
merge_key = common_columns[0]

# Merge Transactions and Cards data on the shared key (inner join: unmatched rows are dropped).
# NOTE(review): this joins on the user key alone. If `cards` holds several rows per
# user, every transaction is paired with every card of that user. If 'Card' in
# transactions corresponds to 'CARD INDEX' in cards, the join should probably match
# on that pair as well - confirm against the data before changing, since it alters
# the row count of the saved table.
transactions_cards = pd.merge(transactions, cards, on=merge_key, how='inner')

# Merge the Transactions+Cards table with the Users data on the same shared key
full_transaction_table = pd.merge(transactions_cards, users, on=merge_key, how='inner')

In [5]:
# Summarise the merged table: its dimensions, then every column name
columns = full_transaction_table.columns
num_of_columns = len(columns)
separator = "=" * 70

print(
    "Merged Table INFO",
    "|",
    f"|- Number of Columns: {num_of_columns}",
    f"|- Number of Rows: {len(full_transaction_table)}",
    separator,
    sep="\n",
)

# List the column names three per line; a slice that runs past the end simply
# yields a shorter final group, so no bounds check is needed
start = 0
while start < num_of_columns:
    print(f"|-- {', '.join(columns[start:start + 3])}")
    start += 3
print(separator)

Merged Table INFO
|
|- Number of Columns: 45
|- Number of Rows: 13412376
|-- User, Card, Year
|-- Month, Day, Time
|-- Amount, Use Chip, Merchant Name
|-- Merchant City, Merchant State, Zip
|-- MCC, Errors?, Is Fraud?
|-- CARD INDEX, Card Brand, Card Type
|-- Card Number, Expires, CVV
|-- Has Chip, Cards Issued, Credit Limit
|-- Acct Open Date, Year PIN last Changed, Card on Dark Web
|-- Person, Current Age, Retirement Age
|-- Birth Year, Birth Month, Gender
|-- Address, Apartment, City
|-- State, Zipcode, Latitude
|-- Longitude, Per Capita Income - Zipcode, Yearly Income - Person
|-- Total Debt, FICO Score, Num Credit Cards


In [7]:
import os

# Save the Merged Table into a data storage
merge_dir = "../../0 - Data/1 - merge"

# exist_ok=True creates the directory (including parents) when it is missing and
# does nothing when it already exists, replacing the separate exists-then-create check.
os.makedirs(merge_dir, exist_ok=True)

# Build the output path from merge_dir so the directory that gets created and the
# directory that gets written to cannot drift apart.
full_transaction_table.to_parquet(f"{merge_dir}/merged_transactions.pq")