In [10]:
# Dependencies
import pandas as pd

# Path to the raw fraud CSV -- replace with your actual file path
file_path = '/Users/tetianabovanenko/Downloads/fraud_data.csv'

# Read the CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to confirm the load worked
data.head(n=5)

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,04-01-2019 00:58,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a3806e984cec6ac0096d8184c64ad3a1,65.654142,-164.722603,1
1,04-01-2019 15:06,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a59185fe1b9ccf21323f581d7477573f,65.468863,-165.473127,1
2,04-01-2019 22:37,Wisozk and Sons,misc_pos,49.61,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,86ba3a888b42cd3925881fa34177b4e0,65.347667,-165.914542,1
3,04-01-2019 23:06,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,3a068fe1d856f0ecedbed33e4b5f4496,64.445035,-166.080207,1
4,04-01-2019 23:59,Friesen Lt,health_fitness,18.17,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,891cdd1191028759dc20dc224347a0ff,65.447094,-165.446843,1


In [13]:
# Convert the day-first date strings into datetime columns.
# Each column is parsed with its own explicit strptime format; values that do
# not match the given format are not silently reinterpreted.
date_formats = {
    'trans_date_trans_time': '%d-%m-%Y %H:%M',
    'dob': '%d-%m-%Y',
}
for date_column, date_format in date_formats.items():
    data[date_column] = pd.to_datetime(data[date_column], dayfirst=True, format=date_format)

# Show the converted columns to confirm the parsing worked
data[list(date_formats)].head()


Unnamed: 0,trans_date_trans_time,dob
0,2019-01-04 00:58:00,1939-11-09
1,2019-01-04 15:06:00,1939-11-09
2,2019-01-04 22:37:00,1939-11-09
3,2019-01-04 23:06:00,1939-11-09
4,2019-01-04 23:59:00,1939-11-09


In [15]:
# Inspect unique values in the is_fraud column
print(data['is_fraud'].unique())

# Clean the is_fraud column. Some raw values carry trailing junk after the flag
# (e.g. '1"2020-12-24 16:56:24"'), so keep only the leading 0/1 digit.
# - astype(str) keeps the cell re-runnable: once the column is already integer,
#   the .str accessor would otherwise raise AttributeError.
# - The pattern is anchored to the start of the value and restricted to 0/1, so
#   a digit from the trailing junk can never be mistaken for the flag.
# - expand=False yields a Series instead of a one-column DataFrame.
fraud_flag = data['is_fraud'].astype(str).str.extract(r'^\s*([01])', expand=False)

# Fail loudly, listing the offending raw values, rather than letting the
# integer cast below die on NaN with an opaque error.
unparsed = fraud_flag.isna()
if unparsed.any():
    raise ValueError(
        f"Unparseable is_fraud values: {data.loc[unparsed, 'is_fraud'].unique().tolist()}"
    )

data['is_fraud'] = fraud_flag.astype(int)

# Verify the cleaned column
data['is_fraud'].unique()

['1' '1"2020-12-24 16:56:24"' '0' '0"2019-01-01 00:00:44"']


array([1, 0])

In [17]:
# Derive calendar features from the transaction timestamp:
# transaction_year, transaction_month, transaction_day, transaction_hour
transaction_time = data['trans_date_trans_time']
date_parts = ('year', 'month', 'day', 'hour')
for date_part in date_parts:
    data[f'transaction_{date_part}'] = getattr(transaction_time.dt, date_part)

# Age is the difference between the transaction year and the birth year only;
# month and day are ignored, so it can be one higher than the true age when
# the birthday falls later in the transaction year.
birth_year = data['dob'].dt.year
data['age'] = data['transaction_year'] - birth_year

# Preview the derived columns
feature_columns = [f'transaction_{date_part}' for date_part in date_parts] + ['age']
data[feature_columns].head()


Unnamed: 0,transaction_year,transaction_month,transaction_day,transaction_hour,age
0,2019,1,4,0,80
1,2019,1,4,15,80
2,2019,1,4,22,80
3,2019,1,4,23,80
4,2019,1,4,23,80


In [19]:
# Remove columns that are no longer needed: the two raw date columns (features
# were derived from them in an earlier cell) and trans_num.
# NOTE(review): trans_num looks like a per-transaction identifier -- confirm it
# carries no signal before relying on this drop.
columns_to_drop = ['trans_date_trans_time', 'dob', 'trans_num']
data_cleaned = data.drop(columns=columns_to_drop)

# Show the first rows of the reduced frame
data_cleaned.head()

Unnamed: 0,merchant,category,amt,city,state,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,transaction_year,transaction_month,transaction_day,transaction_hour,age
0,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",65.654142,-164.722603,1,2019,1,4,0,80
1,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",65.468863,-165.473127,1,2019,1,4,15,80
2,Wisozk and Sons,misc_pos,49.61,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",65.347667,-165.914542,1,2019,1,4,22,80
3,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",64.445035,-166.080207,1,2019,1,4,23,80
4,Friesen Lt,health_fitness,18.17,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",65.447094,-165.446843,1,2019,1,4,23,80


In [21]:
# Write the cleaned frame to CSV without the index column.
# Replace the path below with your desired output path.
data_cleaned.to_csv(
    path_or_buf='/Users/tetianabovanenko/Downloads/cleaned_fraud_data.csv',
    index=False,
)