In [1]:
import sys
print(sys.executable)
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import os
import re
import warnings

c:\Users\User\AppData\Local\pypoetry\Cache\virtualenvs\credit-card-fraud-detection-model-FZHIqfLr-py3.13\Scripts\python.exe


<div style="background-color:#ddecfc; color:#100; padding:30px; border-radius:50px; max-width:1200px; margin:left;">

# Data Exploration – Dataset Tables

In this section we will first explore our raw dataset files to understand 
their structure, columns, datatypes, and potential issues (missing values, duplicates, etc.) before we perform any merge.

<br><u> We have 2 tables: </u></br>
* **customers.csv**
* **credit_card_fraud.csv**

</div>

### <font color = navy > <u> Data load: </u></font>

In [None]:
# Resolve the data directory once so the notebook is not tied to one machine.
# Set the FRAUD_DATA_DIR environment variable to point at another location;
# when it is unset, the original local path is used.
DATA_DIR = os.environ.get(
    "FRAUD_DATA_DIR",
    "C:/Users/User/Documents/GitHub/Credit_Card_Fraud_Detection_Model/data",
)

# The first CSV column holds the saved row index, so it is read as the index.
transactions_df = pd.read_csv(os.path.join(DATA_DIR, "credit_card_fraud.csv"), index_col=0)
# customers.csv is pipe-delimited.
customers_df = pd.read_csv(os.path.join(DATA_DIR, "customers.csv"), sep="|")

In [None]:
print(transactions_df.shape)
print(customers_df.shape)

<div style="background-color:#f4f9fe ; color:#200; padding:30px; border-radius:50px;max-width:1200px; margin:left;">

The transactions_df has 34 million rows. Initial loading of both tables took about 6-12 minutes. <br>
To ensure smoother experimentation and reproducibility:
- let's store the datasets as Pickle (.pkl) files for a quick reload (in case of kernel resets)
</br>

</div>

In [None]:
# Save once (binary format) so later sessions can reload with read_pickle
# instead of re-parsing the CSV files. Paths are relative to the working directory.
transactions_df.to_pickle("transactions.pkl")
customers_df.to_pickle("customers.pkl")
# NOTE(review): this silences every warning from here on, and only takes effect
# when this "save once" cell is actually executed.
warnings.filterwarnings('ignore')


In [2]:
#Quick load :
transactions_df = pd.read_pickle("transactions.pkl")
customers_df = pd.read_pickle("customers.pkl")

#### <font color='Navy'> <b><u> Functions : </u></b></font>

In [None]:
def compare_columns(df1, name1, df2, name2):
    """
    Report how the column sets of two DataFrames relate to each other.

    Printed output:
    1. Columns present in both frames, with the dtype each frame holds.
    2. Columns that exist in only one of the two frames.

    Parameters:
    - df1, df2: pandas DataFrames to compare
    - name1, name2: labels used for each frame in the printed report

    Returns:
    - tuple of sets: (common, only_in_1, only_in_2)
    """
    cols1 = set(df1.columns)
    cols2 = set(df2.columns)
    common = cols1 & cols2
    only_in_1 = cols1 - cols2
    only_in_2 = cols2 - cols1
    divider = "=" * 60

    print(divider)
    print("🔹 Common Columns with Data Types")
    if not common:
        print("No common columns found.")
    else:
        # One row per shared column, listed alphabetically
        shared = sorted(common)
        dtype_table = pd.DataFrame({
            "Column": shared,
            f"{name1} dtype": [df1[col].dtype for col in shared],
            f"{name2} dtype": [df2[col].dtype for col in shared],
        })
        print(dtype_table.to_string(index=False))

    # Same layout for both "only in ..." sections
    for label, extras in ((name1, only_in_1), (name2, only_in_2)):
        print("\n" + divider)
        print(f"🔹 Columns only in {label}:")
        print(sorted(extras) if extras else "No unique columns found.")
    print(divider)

    return common, only_in_1, only_in_2


In [None]:
def unique_values_report(df, name, max_rows=20):
    """
    Print how many distinct values each column of a DataFrame holds.

    Columns are listed from most to fewest distinct values, and only the
    first 'max_rows' of them are shown.
    """
    counts = df.nunique().sort_values(ascending=False)
    report = pd.DataFrame({"Column": counts.index, "Unique Values": counts.values})

    print(f"\n🔹 Unique Value Report for {name} (showing top {max_rows}):")
    print(report.head(max_rows).to_string(index=False))
    


In [None]:
def clean_punc(col):
    """
    Normalize a text Series: lowercase it, drop every character that is not
    a-z, 0-9 or whitespace, then trim leading/trailing whitespace.
    """
    lowered = col.str.lower()
    alnum_only = lowered.str.replace(r"[^a-z0-9\s]", "", regex=True)
    return alnum_only.str.strip()



In [None]:
def column_summary(df):
    """
    Build a per-column overview of a DataFrame.

    Returns a DataFrame indexed by column name with:
    - count: total number of rows in df (the same value for every column)
    - nulls / nulls%: number and percentage of missing values
    - cardinality: number of distinct non-null values
    - dtype: the column's dtype
    """
    missing = df.isnull()
    return pd.DataFrame({
        'count': len(df),
        'nulls': missing.sum(),
        'nulls%': missing.mean() * 100,
        'cardinality': df.nunique(),
        'dtype': df.dtypes,
    })


In [None]:
def check_case_duplicates(df, column):
    """
    Tell whether a text column contains values that differ only by letter case.

    Compares the number of distinct non-null values before and after
    lowercasing, prints the outcome, and returns True when case
    normalization is recommended (False otherwise).
    """
    values = df[column].dropna()
    n_original = len(set(values.unique()))
    n_lowered = len(set(values.str.lower().unique()))

    # Same number of categories after lowercasing → nothing collapses
    if n_original == n_lowered:
        print(f"✅ Column '{column}' has no case duplicates.")
        return False

    print(f"⚠️ Column '{column}' has case duplicates.")
    print(f"   → {n_original - n_lowered} duplicate categories caused by case.")
    return True


In [None]:
def check_column_specials(df, column, sample_size=50000, sample=5):
    """
    Look for punctuation or other special characters in one column.

    Only a random sample of the non-null values is inspected (fixed seed),
    which keeps the check fast on large datasets.

    Parameters:
    - df: pandas DataFrame
    - column: column name (string)
    - sample_size: maximum number of rows to sample
    - sample: number of example values to display if issues are found

    Returns:
    - "Needs Cleaning" if any sampled value contains a character other than
      letters, digits or whitespace; "Clean" otherwise.
    """
    non_null = df[column].dropna()
    n_rows = min(sample_size, non_null.shape[0])
    values = non_null.sample(n_rows, random_state=42).astype(str)

    special_char = re.compile(r"[^a-zA-Z0-9\s]")
    flagged = values.str.contains(special_char, na=False)
    n_flagged = flagged.sum()

    if n_flagged == 0:
        print(f"✅ Column '{column}' is clean (no special characters in sample).")
        return "Clean"

    print(f"⚠️ Column '{column}' has {n_flagged} rows with special characters (in sample).")
    print("   Examples:", values[flagged].unique()[:sample])
    return "Needs Cleaning"


In [None]:
def add_remark(summary_table, col_name, remark):
    """
    Write a remark on one row of the summary table and show the table.

    The table is modified in place and then displayed sorted by
    'cardinality', highest first.

    Parameters:
        summary_table (pd.DataFrame): Summary table indexed by column name.
        col_name (str): Index label of the row to annotate.
        remark (str): Text stored in the 'remark' column for that row.
    """
    target_rows = summary_table.index == col_name
    summary_table.loc[target_rows, "remark"] = remark

    ordered = summary_table.sort_values("cardinality", ascending=False)
    display(ordered)


def status_check(summary_table):
    """
    Show the rows of the summary table that still have no remark.

    A row counts as unmarked when its 'remark' is NaN or an empty string.
    The matching rows are displayed sorted by 'cardinality', highest first.

    Parameters:
        summary_table (pd.DataFrame): The summary table with 'remark' column.
    """
    remarks = summary_table["remark"]
    no_remark = remarks.isna() | remarks.eq("")
    display(summary_table[no_remark].sort_values("cardinality", ascending=False))


<div style="background-color:#ddecfc ;  color:#200; padding:30px; border-radius:50px;">

### 1️⃣ <u><b> Evaluate Data Structure </b></u>

**Objective:** Understand the composition and potential issues in both datasets.

**Actions:**
- Review **shape**, **columns**, and **sample records** to understand structure.
- Use `.info()` and `.describe()` to inspect data types and summary statistics.
- Check for:
  - **Name match** 
  - **Data type match**  
  - **Value overlap**  
  - **Uniqueness**  
  - **Nulls**  
  - **Duplicates**  

</div>

In [None]:
transactions_df.info()

In [None]:
customers_df.info()

<div style="background-color:#f4f9fe; color:#100; padding:20px; border-radius:10px;">
<font color='Navy'>

It seems that we have some type of an overlap between the two sets.
<br>
<h3><u> Pre-Merge Checks : </u></h3>
1. Col name match <br>
2. Data type match<br>
3. Uniqueness<br>
4. Value overlap<br>
5. Nulls<br>
6. Duplicates  <br>


</div>

#### <font color='Navy'>🔹Columns & Data types match check :

In [None]:
# Compare the two tables by column name and dtype
common_cols, only_in_transactions, only_in_customers = compare_columns(
    transactions_df, "transactions", customers_df, "customers"
)

# Columns the two tables share
print("Common columns:", common_cols)

# Null counts on the shared columns - is a merge needed at all, and is the key column complete?
shared_cols = list(common_cols)
for table_label, table in (("transactions", transactions_df), ("customers", customers_df)):
    print(f"Null counts in {table_label}:")
    print(table[shared_cols].isna().sum())



<div style="background-color:#f4f9fe; color:#100; padding:20px; border-radius:10px;">
<font color='Navy'>
<u> Summary of comparison: </u> <br>
</br>
🔹No Nulls in both data sets ✅ <br> 
🔹16 Common features : - with matching names  ✅ -  matching Dtypes  ✅

</br>
🔹 <b> Columns only in transactions: </b> <br>
- ['amt', 'category', 'is_fraud', 'merch_lat', 'merch_long', 'merchant', 'trans_date', 'trans_num', 'trans_time', 'unix_time']
</br>

============================================================
<br>
🔹 <b> Columns only in customers </b> - <font color='red'> No unique columns found.</font>

</br>
</font>

<code>customers.csv</code> has no unique columns of its own to contribute to the transactions set.<br>
At this point, merging seems pointless.<br>
</br>
But before we fully dismiss it, let's see if there is any mismatch between the common values.
Let's check the value overlap using cc_num.

</div>


In [None]:
# Before measuring the match against transactions_df, confirm that every
# cc_num occurs exactly once in customers_df (one row per customer).
dup_customers = customers_df['cc_num'].duplicated().sum()

for label, value in (
    ("Total customers:", customers_df.shape[0]),
    ("Unique customer IDs:", customers_df['cc_num'].nunique()),
    ("Duplicate customer IDs:", dup_customers),
):
    print(label, value)

# Show a handful of the repeated rows when any exist
if dup_customers > 0:
    repeated_rows = customers_df['cc_num'].duplicated(keep=False)
    display(customers_df[repeated_rows].head())


#### <font color='Navy'> 🔹 Value overlap :

In [None]:
# Distinct card numbers on each side. Series.unique() de-duplicates inside
# pandas first, so the Python sets are built from the distinct values only
# instead of iterating over every transaction row.
trans_cards = set(transactions_df['cc_num'].unique())
cust_cards = set(customers_df['cc_num'].unique())

overlap = trans_cards & cust_cards


def _coverage_pct(part, whole):
    """Percentage of `whole` covered by `part`; 0.0 when `whole` is empty."""
    return len(part) / len(whole) * 100 if whole else 0.0


print("Unique cc_num in transactions:", len(trans_cards))
print("Unique cc_num in customers   :", len(cust_cards))
print("Overlapping cc_num           :", len(overlap))
print("Overlap coverage in transactions: {:.2f}%".format(_coverage_pct(overlap, trans_cards)))
print("Overlap coverage in customers   : {:.2f}%".format(_coverage_pct(overlap, cust_cards)))


<font color='Navy'> All unique cc_num values are already included in the transactions dataset. Merging would only duplicate information we already have. </font>

#### <font color='Navy'> 🔹 Duplicate Rows:

In [None]:
# Check both tables: the pre-merge summary reports on duplicates in both sets,
# so the customers check must actually run.
print("Duplicate rows in transactions:", transactions_df.duplicated().sum())
print("Duplicate rows in customers   :", customers_df.duplicated().sum())


#### <font color='Navy'> 🔹 Nulls:

In [None]:
print("Nulls in transactions_df :", transactions_df.isna().sum().sum())
# print("Nulls in customers cc_num   :", customers_df['cc_num'].isna().sum())


<div style="background-color:#f4f9fe; color:#100; padding:20px; border-radius:10px;">
<font color='Navy'>
<u> Pre-Merge Checks Summary: </u> <br>
</br>

1. Common Col Names & Dtypes - match ✅
2. Unique features - only in transactions_df; the customers features overlap fully. 📍
3. Value overlap - customers_df values are encompassed by transactions_df; a few additional customers are included with no match. 📍
4. Nulls - no nulls in both sets ✅
5. Duplicates - no duplicate records found in both sets. ✅

</div>

Since customers.csv provides no additional information, we will be working on credit_card_fraud.csv.


In [None]:
df = transactions_df.copy()

<div style="background-color:#ddecfc ;  color:#200; padding:30px; border-radius:50px;">

### 2️⃣ Convert and Correct Data Types

**Objective:** Ensure each column is stored in the appropriate format.

**Actions:**
- Data type Corrections
- Map Object Dtypes :
    - tex
- Convert date/time fields to `datetime` (`pd.to_datetime()`).
- Convert textual categorical fields to `category` dtype.
- Drop or isolate identifiers that are unique per record.

In [None]:
df.info()

#### <font color='Navy'> 🔹Value Count </font>
- Drop features with only 1 unique value, if any exist.

In [None]:
unique_counts = df.nunique().sort_values()
print(unique_counts)


#### <font color='Navy'> 🔹Data type Correction - Date type

In [None]:
# errors='coerce' turns any value that cannot be parsed into NaT instead of
# raising, so a bad value surfaces later as a missing value, not as an error here.
df['trans_date'] = pd.to_datetime(df['trans_date'], errors='coerce')
# Only a time of day is parsed here (format '%H:%M:%S'); the column is later
# read through .dt.hour to derive the transaction hour.
df['trans_time'] = pd.to_datetime(df['trans_time'], format='%H:%M:%S', errors='coerce')
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')
# customers_df['dob'] = pd.to_datetime(customers_df['dob'], errors='coerce')


In [None]:
min_date, max_date = df['trans_date'].min(), df['trans_date'].max()
# Print the results
print(f"Min Date: {min_date}") 
print(f"Max Date: {max_date}")

In [None]:
df['trans_year'] = df['trans_date'].dt.year
df['trans_month'] = df['trans_date'].dt.month

In [None]:
# Only these three columns feed the monthly fraud count below, so take just
# them instead of deep-copying the entire transactions frame.
df_temp = df[['trans_year', 'trans_month', 'is_fraud']]

In [None]:
monthly_fraud_counts = df_temp.groupby(['trans_year', 'trans_month', 'is_fraud']).size()

In [None]:
display(monthly_fraud_counts.head(3))

In [None]:
result = monthly_fraud_counts.unstack(fill_value=0)
result.columns = ['Non-Fraud', 'Fraud']
display (result.head(3))

In [None]:
result['Total'] = result['Non-Fraud'] + result['Fraud']
result['Fraud Percentage'] = (result['Fraud'] / result['Total']) * 100
display(result.head(3))

In [None]:
# Pivot to have Month as index and Year as columns
result = result.reset_index()
pivot_data = result.pivot(index='trans_month', columns='trans_year', values='Fraud Percentage')

# Plot
plt.figure(figsize=(10, 6))
pivot_data.plot(marker='o', linewidth=2)
plt.title('Fraud Percentage Comparison - 2019 vs 2020', fontsize=14)
plt.xlabel('Month')
plt.ylabel('Fraud Percentage (%)')
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend(title='Year')
plt.show()

# Data Reduction :
- Fraud percentage and pattern seem consistent per month for 2019 and 2020.
- To improve computational efficiency, we can focus on 2020 data only for further exploration and model training.

In [None]:
# Reduced Data Set:
df_2020 = df[df['trans_year'] == 2020].copy()

In [None]:
df_2020.to_pickle("transactions_2020.pkl")

In [None]:
df_2020 = pd.read_pickle("transactions_2020.pkl")

In [None]:
df_2020.shape

#### <font color='Navy'> 🔹Date Dtypes :

In [None]:
# --- Extract date features ---
df['trans_year'] = df['trans_date'].dt.year
df['trans_quarter'] = df['trans_date'].dt.quarter
df['trans_month'] = df['trans_date'].dt.month
df['trans_day'] = df['trans_date'].dt.day
df['trans_dayofweek'] = df['trans_date'].dt.dayofweek
df['trans_hour'] = df['trans_time'].dt.hour

# --- Derive age ---
# Age of the customer, in completed years, at the time of the transaction.
# Dividing elapsed days by 365 ignores leap days and counts a birthday a few
# days early (roughly one day per four years of age), so compare calendar
# dates instead: year difference, minus one if the birthday has not yet
# occurred in the transaction year.
trans_month_day = df['trans_date'].dt.month * 100 + df['trans_date'].dt.day
birth_month_day = df['dob'].dt.month * 100 + df['dob'].dt.day
birthday_pending = (trans_month_day < birth_month_day).astype(int)
df['age'] = df['trans_date'].dt.year - df['dob'].dt.year - birthday_pending

#### <font color='red'> 🔹Data Drop List Container Definition:

- Removal of original features that already split to columns like dates ( month, year, hour)


In [None]:
# Initialize a list to track columns to drop later
cols_to_drop = []

# Example: when you identify columns to drop, append them instead of removing now
cols_to_drop.append('trans_time')
cols_to_drop.append('dob')


#### <font color='Navy'> 🔹Handle Object Data Types:

- textual data convert to string
- remove special keys
- Classify to category 

In [None]:
df = df.astype({col: 'string' for col in df.select_dtypes(include='object').columns})
df.info()

In [None]:
# df_2020 was split off (and pickled) before the object -> string conversion
# was applied to df, so its text columns are still 'object'. Select both
# dtypes so the selection is not empty.
df_strs = df_2020.select_dtypes(include=['object', 'string'])

In [None]:
df_strs.head(4)

In [None]:
summary_table = column_summary(df_strs)
display(summary_table.sort_values("cardinality", ascending=False))

- `trans_num` has high cardinality and all are unique identifiers (not really useful)
- `ssn` is unique identifier for the customers, no point removing '-', we might use it for group by.
- `street`,`city`,`state` - check for granularity and category reduction.
- `last` & `first` - consider removing.
- `job` - reduce categories.
- `category` - ?? keep
- `merchant` - clean text

In [None]:
df_strs.head(2)

In [None]:
df_strs =df.select_dtypes(include='string')

Since customers.csv provides no additional information, we will be working on credit_card_fraud.csv.
1. Duplicate rows - completed in previous section. no duplicates.
2. Null inspection
3. Check object columns
4. drop if a column has only one unique value
5. Convert an objects into viable data 
6. Clean special characters and punctuation
7. convert gender
8. drop first and last name
9. group states
10. remove street?

In [None]:
# Run on transactions_df
summary_table = column_summary(df)
display(summary_table.sort_values("nulls%", ascending=False))



## Column Summary – Key Insights

- **No Nulls**: All columns have 0 null values → no imputation required at this stage.  
- **Dataset Size**: The dataset contains ~34.6M rows → operations can be slow, so sampling may be needed for EDA.  
- **High-Cardinality (ID-like) Columns**: `ssn`, `cc_num`, `acct_num`, `trans_num` have many unique values (up to 1 per row).  
  - These are identifiers and not useful for EDA/modeling beyond grouping.  
- **Categorical Columns (low/moderate cardinality)**:  
  - `gender` (2 values), `state` (51), `category` (14), `profile` (12) → well-suited for analysis.  
- **Medium/High-Cardinality Categoricals**:  
  - `city` (~5K), `job` (~600), `merchant` (~700) → useful but may need category reduction (grouping rare values).  
- **Datetime Columns**: `dob`, `trans_date`, `trans_time` are already converted.  
  - Useful for deriving `age`, `year`, `month`, `day_of_week`.  
- **Numeric Columns**:  
  - `amt` (transaction amount), `city_pop`, `lat/long`, `merch_lat/long`.  
  - Some have very high precision and may require binning or transformation.  

**➡️ Next Step:** Focus on reducing categories, deriving new time-based features, and preparing the dataset for EDA visualizations.  


# Columns Consider dropping:

In [None]:
# Identify columns where every row has a unique value
columns_to_drop = df.columns[df.nunique() == df.shape[0]]


In [None]:
print("Columns with all unique values (to consider dropping):", list(columns_to_drop))

### paired data checker:

In [None]:
# checks whether the SSN (ssn) and credit card number (cc_num) always change together in a DataFrame (df).
def paired_data_checker(df, col1, col2):
    """
    Check whether two columns are in a one-to-one relationship, i.e. every
    distinct value of one column is paired with exactly one distinct value
    of the other, in both directions.

    Parameters:
    - df: pandas DataFrame
    - col1, col2: names of the two columns to compare

    Returns:
    - True if the columns always change together (one of them is redundant),
      False otherwise. The outcome is also printed.
    """
    n_pairs = len(df[[col1, col2]].drop_duplicates())
    # dropna=False keeps these counts consistent with drop_duplicates(),
    # which treats missing values as a value of their own.
    n_col1 = df[col1].nunique(dropna=False)
    n_col2 = df[col2].nunique(dropna=False)

    # One-to-one requires all three counts to agree. Comparing against
    # max(n_col1, n_col2) alone would also accept a many-to-one mapping,
    # where dropping the finer-grained column loses information.
    one_to_one = n_pairs == n_col1 == n_col2

    if one_to_one:
        print(f"{col1} and {col2} always change together. consider dropping one of them.")
    else:
        print(f"{col1} and {col2} do not always change together.")
    return one_to_one

# ssn_ccnum_pairs = df[['ssn', 'cc_num']].drop_duplicates()
# if len(ssn_ccnum_pairs) == len(df[['ssn', 'cc_num']].drop_duplicates(subset=['ssn'])):
#     print("ssn and cc_num always change together. Check whether to delete the column.")
# else:
#     print("ssn and cc_num do not always change together.")
# #############  df = df.drop(columns=['ssn']) ############    

In [None]:
paired_data_checker(df, 'ssn', 'cc_num')
paired_data_checker(df, 'acct_num', 'cc_num')

## Handle objects:

In [None]:
df = df.astype({col: 'string' for col in df.select_dtypes(include='object').columns})
df.info()

In [None]:
# df['ssn'].head(5) --> 750-09-7342
df['ssn'] = df['ssn'].str.replace('-', '', regex=False)
df['ssn'].head(5)

### Gender conversion

In [None]:
# Encode gender as an integer flag: 'F' -> 1, 'M' -> 0.
# The replacement values are the strings '1'/'0' and the column is cast to
# int afterwards; values other than 'F'/'M' pass through replace() unchanged.
df['gender'] = df['gender'].replace({'F': '1', 'M': '0'}).astype(int)
# Show how many rows fall in each encoded class
df['gender'].value_counts()



In [None]:
df_city = df['city']
df_city.value_counts()

In [None]:
df_city2 = df_city.str.upper()

In [None]:
# Run the check on every text column of interest and let the function print
# the outcome for each one, rather than checking a single column and showing
# a hard-coded success message.
for text_col in ("city", "state", "job"):
    check_case_duplicates(df, text_col)


# Handling strings columns:

1. Columns to remove - create variable that marks them ( actual removal can be later)
2. Extraction of additional data

In [None]:
df_strs =df.select_dtypes(include='string')



In [None]:
df_strs.head(3)

## 1) Cardinality Inspection:

2.1 Job Column:

In [None]:
# Run on transactions_df
summary_table = column_summary(df_strs)
display(summary_table.sort_values("cardinality", ascending=False))


# 1) Columns to remove

In [None]:
removal_columns = ["ssn", "first", "last", "street", "trans_num"]
# ssn - sensitive, no predictive power, unique per person
# first, last - no predictive power
# street - high cardinality, no predictive power
# trans_num - unique per transaction, no predictive power
# Tag the listed columns in the summary table only; nothing is dropped from df here.
summary_table.loc[summary_table.index.isin(removal_columns), "remark"] = "remove"
display(summary_table.sort_values("cardinality", ascending=False))

In [None]:
status_check(summary_table)

In [None]:
print(df_strs["job"].str.lower().str.strip().unique().tolist())

In [None]:
unique_vals = df_strs["job"].unique().tolist()
print(unique_vals)



In [None]:
# Ordered (category, keywords) rules: the first category with a matching
# keyword wins, so the order of this tuple matters.
_JOB_CATEGORY_RULES = (
    ("healthcare", ("nurse", "doctor", "surgeon", "dentist", "therapist", "pharmacist", "psychiatrist", "psychologist", "radiographer", "optician", "midwife", "paramedic", "biomedical", "oncologist", "immunologist", "pathologist", "health")),
    ("education", ("teacher", "lecturer", "professor", "educator", "education officer", "tutor", "school")),
    ("science/engineering", ("engineer", "technician", "technologist", "architect", "surveyor", "scientist", "geologist", "chemist", "biologist", "researcher", "ecologist", "mathematician", "statistician", "physicist", "astronomer")),
    ("legal", ("lawyer", "barrister", "solicitor", "attorney", "legal", "judge", "magistrate")),
    ("finance", ("accountant", "finance", "banker", "economist", "trader", "investment", "treasurer", "auditor", "actuary")),
    ("arts/media", ("artist", "designer", "animator", "illustrator", "musician", "actor", "writer", "journalist", "editor", "photographer", "producer", "curator", "painter", "sculptor", "filmmaker")),
    ("management/business", ("manager", "consultant", "officer", "administrator", "coordinator", "executive", "director", "chief", "ceo", "cfo", "cio", "cto", "cmo", "coo")),
    ("public safety/military", ("armed forces", "navy", "army", "air force", "military", "police", "firefighter", "security")),
    ("agriculture/environment", ("agricultural", "farm", "horticulturist", "fisheries", "forester", "conservation", "ecologist", "gardener", "landscaper", "tree surgeon")),
    ("technology", ("it", "software", "developer", "programmer", "data scientist", "web", "computer", "cyber", "network", "systems", "applications", "ai", "machine learning")),
)

# Keywords up to this length (e.g. "it", "ai", "cto") must match as whole
# words; as plain substrings they hit unrelated titles such as "charity"
# or "retail".
_WHOLE_WORD_MAX_LEN = 3


def _compile_job_rule(keywords):
    """Combine one category's keywords into a single compiled regex."""
    parts = [
        rf"\b{re.escape(word)}\b" if len(word) <= _WHOLE_WORD_MAX_LEN else re.escape(word)
        for word in keywords
    ]
    return re.compile("|".join(parts))


_JOB_CATEGORY_PATTERNS = tuple(
    (category, _compile_job_rule(keywords)) for category, keywords in _JOB_CATEGORY_RULES
)


def _assign_job_category(job):
    """Return the first category whose pattern matches the lowercased job title, else 'other'."""
    text = str(job).lower()
    for category, pattern in _JOB_CATEGORY_PATTERNS:
        if pattern.search(text):
            return category
    return "other"


def categorize_jobs(df, column, new_column="job_category"):
    """
    Categorizes jobs based on keywords in the job title.
    Adds a new column with high-level categories.

    Longer keywords match anywhere in the title (so "therapist" also covers
    "psychotherapist"); keywords of three letters or fewer must appear as
    separate words. Titles that match nothing, and missing values, are
    labelled "other".

    Parameters:
    - df: pandas DataFrame (modified in place)
    - column: name of the column holding the job titles
    - new_column: name of the column that receives the category

    Returns:
    - the same DataFrame, with `new_column` added
    """
    titles = df[column]
    # Classify each distinct title once, then broadcast the result to all
    # rows; far cheaper than running the keyword scan on every row.
    category_by_title = {
        title: _assign_job_category(title) for title in titles.dropna().unique()
    }
    df[new_column] = titles.map(category_by_title).fillna("other")
    return df


In [None]:
df = categorize_jobs(df, "job")
df[["job", "job_category"]].head(20)


In [None]:
df_strs =df.select_dtypes(include='string')


In [None]:
add_remark(summary_table, "job", "Reduced Cardinality to job_category")

In [None]:
status_check(summary_table)



In [None]:
merchant= df_strs['merchant']
merchant.value_counts()


In [None]:
check_column_specials(df_strs, 'merchant', sample_size=10000, sample=10)

### Issues with 'merchant':
- High cardinality (many unique values)
- High granularity 
- Punctuation and special characters

In [None]:
# Strip the 'fraud_' marker and commas from merchant names. Both patterns are
# literal text, so regex=False is stated explicitly, matching the other
# str.replace calls in this notebook.
df['merchant'] = (
    df['merchant']
    .str.replace('fraud_', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [None]:
df['merchant'].value_counts()

In [None]:
df.to_pickle("df_prep_str.pkl")


In [None]:
df = pd.read_pickle("df_prep_str.pkl")

In [None]:
status_check(summary_table)


In [None]:
df['profile']

In [None]:
# Per-merchant totals: number of transactions and the sum of is_fraud
# (presumably a 0/1 flag, so the sum is the fraud count - TODO confirm),
# sorted so the merchants with the highest fraud total come first.
category_stats = (
    df.groupby('merchant')
      .agg(total_transactions=('merchant', 'count'),
           total_fraud=('is_fraud', 'sum'))
      .reset_index().sort_values(by='total_fraud', ascending=False))

print(category_stats)

Profile Clean

In [None]:
print(df["profile"].head(10))   # shows first 10 rows


In [None]:
# Remove the .json ending
df['profile'] = df['profile'].str.replace('.json', '', regex=False)

In [None]:
print(df['profile'].head(10))   # shows first 10 rows

In [None]:
df['profile'].unique()

In [None]:
df["age_group"] = df["profile"].str.split("_").str[:2].str.join("_")


In [None]:
df["age_group"].value_counts()

In [None]:
df["location_profile"] = df["profile"].str.split("_").str[-1]


In [None]:
df["location_profile"].value_counts()

In [None]:
df.to_pickle("df_prep_str.pkl")

## 2) Extraction of additional data:

In [None]:
# NOTE(review): this repeats the per-merchant summary computed earlier in the
# notebook (same grouping, aggregation and sort); kept here unchanged.
# Per-merchant totals: number of transactions and the sum of is_fraud.
category_stats = (
    df.groupby('merchant')
      .agg(total_transactions=('merchant', 'count'),
           total_fraud=('is_fraud', 'sum'))
      .reset_index().sort_values(by='total_fraud', ascending=False))

print(category_stats)

In [None]:
check_column_specials(df, "city")
# check_column_specials(transactions_df, "job")
# check_column_specials(transactions_df, "merchant")




# Extraction of additional data:

# Data Drops:
- Street? high granularity
- First and Last name
- Categorical columns where all values are unique.
- Columns with only 1 unique value.


In [None]:

# When ready, drop all at once
# df_cleaned = df.drop(columns=cols_to_drop)