In [20]:
#!pip install kaggle
#!pip install kagglehub

Load the 2020 transactions in chunks, filtering each chunk as it is read, to keep memory use and run time manageable.

In [23]:
import pandas as pd

CHUNKSIZE = 50000              # rows read from the CSV per iteration
date_column = 'trans_date'     # parsed as datetime and used for the year filter
all_2020_chunks = []

# Stream the CSV in CHUNKSIZE-row pieces so that only the 2020 rows are ever
# accumulated. Using the reader as a context manager guarantees the underlying
# file handle is closed even if parsing or filtering raises part-way through.
with pd.read_csv('credit_card_fraud.csv', chunksize=CHUNKSIZE, parse_dates=[date_column]) as reader:
    for chunk in reader:
        # Filter each chunk for the year 2020
        df_2020_chunk = chunk[chunk[date_column].dt.year == 2020]
        all_2020_chunks.append(df_2020_chunk)

df = pd.concat(all_2020_chunks, ignore_index=True)

# pd.concat copied the rows into `df`; drop the per-chunk pieces so the kernel
# does not keep a second copy of the 2020 data alive for the rest of the session.
del all_2020_chunks, chunk, df_2020_chunk

print(f"Successfully loaded and filtered dataset. Total 2020 records: {len(df):,}")

Successfully loaded and filtered dataset. Total 2020 records: 17,292,422


In [24]:
# Inspect the column names and dtypes of the concatenated 2020 frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17292422 entries, 0 to 17292421
Data columns (total 27 columns):
 #   Column      Dtype         
---  ------      -----         
 0   Unnamed: 0  int64         
 1   ssn         object        
 2   cc_num      int64         
 3   first       object        
 4   last        object        
 5   gender      object        
 6   street      object        
 7   city        object        
 8   state       object        
 9   zip         int64         
 10  lat         float64       
 11  long        float64       
 12  city_pop    int64         
 13  job         object        
 14  dob         object        
 15  acct_num    int64         
 16  profile     object        
 17  trans_num   object        
 18  trans_date  datetime64[ns]
 19  trans_time  object        
 20  unix_time   int64         
 21  category    object        
 22  amt         float64       
 23  is_fraud    int64         
 24  merchant    object        
 25  merch_lat   floa

## Choosing a state to focus on based on the highest absolute number of fraud cases

In [25]:
# Sum the is_fraud flag per state (assumes is_fraud is a 0/1 indicator, so the
# sum is the number of fraud cases) and rank states from most to fewest.
fraud_counts = df.groupby('state', as_index=False)['is_fraud'].sum()
top_fraud_states = fraud_counts.sort_values('is_fraud', ascending=False)

print("--- Top 3 States by Absolute Fraud Count ---")
print(top_fraud_states.iloc[:3].to_markdown(index=False))

# The first row after sorting is the state with the most fraud cases.
best_state_to_keep = top_fraud_states.iloc[0].loc['state']
best_fraud_count = top_fraud_states.iloc[0].loc['is_fraud']

print(f"\nRecommendation: The state to focus on is **'{best_state_to_keep}'** with **{best_fraud_count}** fraud cases.")

--- Top 3 States by Absolute Fraud Count ---
| state   |   is_fraud |
|:--------|-----------:|
| CA      |      10894 |
| TX      |       7344 |
| NY      |       5945 |

Recommendation: The state to focus on is **'CA'** with **10894** fraud cases.


## Scoping the data to California

In [26]:
import gc

# Keep only the California rows as an independent copy, so the full 2020
# frame can be released afterwards.
ca_mask = df['state'].eq('CA')
clean_df = df.loc[ca_mask].copy()

# Drop the large source frame and ask the collector to reclaim it now.
# Note: once `df` is deleted, this cell cannot be re-run without reloading.
del df
gc.collect()



287

In [27]:
# Check the row count and dtypes after restricting the data to California.
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2013945 entries, 0 to 17284449
Data columns (total 27 columns):
 #   Column      Dtype         
---  ------      -----         
 0   Unnamed: 0  int64         
 1   ssn         object        
 2   cc_num      int64         
 3   first       object        
 4   last        object        
 5   gender      object        
 6   street      object        
 7   city        object        
 8   state       object        
 9   zip         int64         
 10  lat         float64       
 11  long        float64       
 12  city_pop    int64         
 13  job         object        
 14  dob         object        
 15  acct_num    int64         
 16  profile     object        
 17  trans_num   object        
 18  trans_date  datetime64[ns]
 19  trans_time  object        
 20  unix_time   int64         
 21  category    object        
 22  amt         float64       
 23  is_fraud    int64         
 24  merchant    object        
 25  merch_lat   float64   

## Consolidating high-cardinality features

In [28]:
def consolidate_top_categories(
    df: pd.DataFrame,
    column_name: str,
    top_n: int = 20,
    other_label: str = 'Other',
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Consolidates low-frequency values in a specified column into a single
    placeholder category to reduce cardinality.

    The column is overwritten **in place** on the DataFrame that is passed in
    (no copy of the frame is made); the same object is returned so the call
    can be written as ``df = consolidate_top_categories(df, ...)``.

    The function is idempotent: values already equal to ``other_label`` are
    never counted as one of the ``top_n`` categories, so running it again on
    already-consolidated data keeps the same categories instead of folding
    one more real category into the placeholder.

    Missing values (NaN) are also relabelled as ``other_label``, because they
    are never counted among the top categories.

    Args:
        df (pd.DataFrame): The input DataFrame. Modified in place.
        column_name (str): The name of the column to consolidate.
        top_n (int): The number of top-most frequent categories to keep.
                     All others will be labeled ``other_label``.
        other_label (str): Placeholder assigned to every value outside the
                     top ``top_n``. Defaults to 'Other'.
        verbose (bool): When True (default), print the value counts of the
                     column after consolidation as a markdown table.

    Returns:
        pd.DataFrame: The same DataFrame object, with the specified column modified.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    value_counts = df[column_name].value_counts()

    # Rank only genuine categories: the placeholder must not occupy one of the
    # top_n slots, otherwise a second run would push a real category out.
    ranked_counts = value_counts[value_counts.index != other_label]
    top_categories = ranked_counts.nlargest(top_n).index

    df[column_name] = df[column_name].where(
        df[column_name].isin(top_categories),
        other_label,
    )

    if verbose:
        # Print the results for verification. floatfmt=".0f" keeps large counts
        # exact when the table renderer treats the count column as floats
        # (otherwise they can appear in scientific notation, e.g. 1.40941e+06).
        print(f"--- Value Counts for '{column_name}' after Consolidation ---")
        print(df[column_name].value_counts().to_markdown(floatfmt=".0f"))
        print("-" * 50)

    return df



In [29]:

# Keep the 20 most frequent cities and relabel every other city as 'Other'.
# The 'city' column of clean_df is overwritten, so the original city values
# are no longer available after this cell.
clean_df = consolidate_top_categories(clean_df, 'city', top_n=20)

--- Value Counts for 'city' after Consolidation ---
| city           |            count |
|:---------------|-----------------:|
| Other          |      1.40941e+06 |
| Los Angeles    | 131821           |
| San Jose       |  59422           |
| San Diego      |  46185           |
| San Francisco  |  40430           |
| Sacramento     |  35189           |
| Fresno         |  28433           |
| Riverside      |  26704           |
| Long Beach     |  25085           |
| Bakersfield    |  24839           |
| Chula Vista    |  19786           |
| Santa Ana      |  19781           |
| Oakland        |  17438           |
| Corona         |  17411           |
| Anaheim        |  17334           |
| Oxnard         |  17213           |
| San Bernardino |  17196           |
| Stockton       |  15872           |
| Glendale       |  15138           |
| Fremont        |  14937           |
| Oceanside      |  14323           |
--------------------------------------------------


In [30]:

# Keep the 20 most frequent merchants and relabel every other merchant as 'Other'.
# NOTE(review): in the recorded run roughly 92% of rows end up in 'Other'
# (about 1.86M of 2,013,945) — confirm that top_n=20 keeps enough signal here.
clean_df = consolidate_top_categories(clean_df, 'merchant', top_n=20)

--- Value Counts for 'merchant' after Consolidation ---
| merchant                            |          count |
|:------------------------------------|---------------:|
| Other                               |    1.86064e+06 |
| fraud_O'Connell, Botsford and Hand  | 7786           |
| fraud_Reilly LLC                    | 7754           |
| fraud_Botsford PLC                  | 7716           |
| fraud_Champlin-Casper               | 7708           |
| fraud_Wuckert-Goldner               | 7674           |
| fraud_Botsford and Sons             | 7674           |
| fraud_Windler, Goodwin and Kovacek  | 7671           |
| fraud_Schmidt-Larkin                | 7669           |
| fraud_Hettinger, McCullough and Fay | 7669           |
| fraud_Schiller, Blanda and Johnson  | 7660           |
| fraud_Pollich LLC                   | 7655           |
| fraud_White and Sons                | 7646           |
| fraud_Cole, Hills and Jewess        | 7645           |
| fraud_Padberg-Sauer           

In [31]:
# Keep the 20 most frequent jobs and relabel every other job as 'Other'.
# NOTE(review): in the recorded run roughly 92% of rows end up in 'Other'
# (about 1.84M of 2,013,945) — confirm that top_n=20 keeps enough signal here.
clean_df = consolidate_top_categories(clean_df, 'job', top_n=20)

--- Value Counts for 'job' after Consolidation ---
| job                                           |           count |
|:----------------------------------------------|----------------:|
| Other                                         |     1.84333e+06 |
| Patent attorney                               | 12343           |
| Child psychotherapist                         | 10910           |
| Social researcher                             | 10651           |
| Information systems manager                   |  9887           |
| Armed forces operational officer              |  9261           |
| Surveyor, minerals                            |  8836           |
| Engineer, structural                          |  8764           |
| Magazine journalist                           |  8464           |
| Engineer, mining                              |  8438           |
| Art therapist                                 |  8052           |
| Biomedical engineer                           |  8045          

In [32]:
# Binary gender flag: 1 where gender is exactly 'M', 0 for every other value
# (missing values included). A vectorised comparison replaces the per-row
# Python lambda; the explicit 'int64' keeps the dtype the same on every platform.
clean_df['is_male'] = clean_df['gender'].eq('M').astype('int64')

In [33]:
# Normalise the consolidated labels to lower case. 'job' is cast to the pandas
# 'string' dtype first; 'merchant' is only lower-cased here and is cast to
# 'string' in a later cell.
clean_df['job'] = clean_df['job'].astype('string').str.lower()
clean_df['merchant'] = clean_df['merchant'].str.lower()

In [34]:
# Cast 'city' to the pandas 'string' dtype and lower-case it in one step.
clean_df['city'] = clean_df['city'].astype('string').str.lower()

In [35]:
# --- Dates -------------------------------------------------------------------
clean_df['dob'] = pd.to_datetime(clean_df['dob'])
clean_df['trans_date'] = pd.to_datetime(clean_df['trans_date'])

# Build a full transaction timestamp by joining the calendar date with the
# time-of-day text, then parsing the combined "YYYY-MM-DD HH:MM:SS" string.
time_str = clean_df['trans_time'].astype('string')
clean_df['trans_timestamp'] = pd.to_datetime(
    clean_df['trans_date'].dt.strftime('%Y-%m-%d') + ' ' + time_str,
    format='%Y-%m-%d %H:%M:%S',
)

# --- Text columns: explicit 'string' dtype, lower-cased where labels are used --
clean_df['category'] = clean_df['category'].astype('string').str.lower()
clean_df['profile'] = clean_df['profile'].astype('string').str.lower()

# --- Identifier-like columns: explicit 'string' dtype only ----------------------
clean_df['merchant'] = clean_df['merchant'].astype('string')
clean_df['trans_num'] = clean_df['trans_num'].astype('string')
clean_df['ssn'] = clean_df['ssn'].astype('string')




In [None]:
# Drop columns that are no longer needed: 'gender' and 'trans_time' were
# re-encoded above as 'is_male' and 'trans_timestamp', and 'state' is constant
# after the California filter. Re-running this cell on the already-reduced
# frame raises KeyError because the columns are gone.
# NOTE(review): 'ssn', 'cc_num' and 'acct_num' are not in this list and remain
# in the frame that is saved below — confirm they are meant to be kept.
clean_df = clean_df.drop(columns=['first', 'last', 'street', 'unix_time', 'gender', 'state', 'trans_time'])

In [37]:
# Review the final schema after the type conversions and the column drop.
clean_df.info(max_cols=30)

<class 'pandas.core.frame.DataFrame'>
Index: 2013945 entries, 0 to 17284449
Data columns (total 22 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Unnamed: 0       int64         
 1   ssn              string        
 2   cc_num           int64         
 3   city             string        
 4   zip              int64         
 5   lat              float64       
 6   long             float64       
 7   city_pop         int64         
 8   job              string        
 9   dob              datetime64[ns]
 10  acct_num         int64         
 11  profile          string        
 12  trans_num        string        
 13  trans_date       datetime64[ns]
 14  category         string        
 15  amt              float64       
 16  is_fraud         int64         
 17  merchant         string        
 18  merch_lat        float64       
 19  merch_long       float64       
 20  is_male          int64         
 21  trans_timestamp  datetime64[ns]
dty

In [38]:
# Persist the prepared frame as a pickle in the current working directory so
# it can be reloaded with its dtypes intact.
# Note: pickle files should only ever be loaded from trusted sources.
file_path = 'prepped_data.pkl'
clean_df.to_pickle(file_path)

print(f"✅ Data saved successfully to: {file_path}")

✅ Data saved successfully to: prepped_data.pkl
