In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the fraud transaction table from a CSV in the working directory.
fraud_df1 = pd.read_csv("fraud_transactions.csv")

In [3]:
# Load the loan application table from a CSV in the working directory.
loan_df1 = pd.read_csv("loan_data.csv")

In [4]:
# Preview the first five rows of the fraud frame (head() defaults to n=5).
fraud_df1.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V27,V28,Amount,Class,log_amount,amount_bin,hour_of_day,is_night,V1_sq,V2_V3
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.133558,-0.021053,149.62,0,5.01476,4,0.0,1,1.849075,-0.184598
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.008983,0.014724,2.69,0,1.305626,0,0.0,1,1.420523,0.044309
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,-0.055353,-0.059752,378.66,0,5.939276,4,0.0,1,1.845126,-2.37639
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,0.062723,0.061458,123.5,0,4.824306,4,0.0,1,0.933681,-0.332109
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.219422,0.215153,69.99,0,4.262539,3,0.0,1,1.341504,1.359367


In [5]:
# Preview the first five rows of the loan frame (head() defaults to n=5).
loan_df1.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amount,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22,female,Master,71948,0,RENT,35000,PERSONAL,16.02,0.49,3,561,No,1
1,21,female,High School,12282,0,OWN,1000,EDUCATION,11.14,0.08,2,504,Yes,0
2,25,female,High School,12438,3,MORTGAGE,5500,MEDICAL,12.87,0.44,3,635,No,1
3,23,female,Bachelor,79753,0,RENT,35000,MEDICAL,15.23,0.44,2,675,No,1
4,24,male,Master,66135,1,RENT,35000,MEDICAL,14.27,0.53,4,586,No,1


In [6]:
# Feature engineering on the fraud transaction data: derive amount-, time- and
# interaction-based columns from the existing variables.
# NOTE(review): the head() preview of the freshly loaded frame already lists these
# six columns, so the input CSV may already contain them and this cell would be
# overwriting them -- confirm which file is the raw source.

# log(1 + Amount): stays defined when Amount is 0.
fraud_df1['log_amount'] = np.log1p(fraud_df1['Amount'])

# Quantile-based split of Amount into 5 bins; labels=False yields the integer
# bin index (0-4) rather than interval objects.
fraud_df1['amount_bin'] = pd.qcut(fraud_df1['Amount'], 5, labels=False)

# Position within a 24-hour cycle, in whole hours (0-23).
# assumes Time is measured in seconds from some origin -- TODO confirm; if the
# origin is not midnight this is an offset hour, not wall-clock time.
fraud_df1['hour_of_day'] = (fraud_df1['Time'] % (24*3600)) // 3600

# 1 when hour_of_day is below 6, else 0. Vectorized comparison instead of a
# row-wise apply/lambda: same values (missing hours still map to 0) without a
# Python-level call per row.
fraud_df1['is_night'] = (fraud_df1['hour_of_day'] < 6).astype('int64')

# Squared term for V1.
fraud_df1['V1_sq'] = fraud_df1['V1'] ** 2

# Pairwise interaction between V2 and V3.
fraud_df1['V2_V3'] = fraud_df1['V2'] * fraud_df1['V3']


In [7]:
# Spot-check five random rows after feature engineering. The fixed random_state
# makes the displayed rows identical on every Restart & Run All.
fraud_df1.sample(5, random_state=42)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V27,V28,Amount,Class,log_amount,amount_bin,hour_of_day,is_night,V1_sq,V2_V3
241068,150869.0,2.113637,-0.093677,-1.360988,0.256903,0.157456,-0.828854,0.139845,-0.266851,0.651723,...,-0.074054,-0.068101,0.26,0,0.231112,0,17.0,0,4.467462,0.127494
62375,50245.0,0.311411,-1.877819,0.400605,-0.334004,-1.67759,-0.626816,0.067376,-0.170639,1.73218,...,0.006042,0.120533,484.5,0,6.185179,4,13.0,0,0.096977,-0.752263
28493,35032.0,-1.40314,-1.282165,0.684218,-1.447717,0.060447,1.921875,1.661197,0.148781,-2.095181,...,-0.239326,-0.166108,429.0,0,6.063785,4,9.0,0,1.968802,-0.877281
284101,172152.0,-2.525565,0.70366,-0.298992,-0.123133,-0.442029,-1.218844,0.685789,0.643261,-0.546895,...,0.440683,0.106358,123.19,0,4.821813,4,23.0,0,6.378477,-0.210389
182381,125372.0,1.82392,-1.618256,-0.766254,-0.837007,-1.215481,-0.121801,-1.024127,0.116235,0.050311,...,-0.03672,-0.026256,169.0,0,5.135798,4,10.0,0,3.326683,1.239995


In [8]:
# Feature engineering on the loan data.

# Requested loan amount as a fraction of the applicant's income.
# NOTE(review): in the displayed rows this matches loan_percent_income up to
# rounding -- confirm the two columns are not redundant.
loan_df1['debt_to_income'] = loan_df1['loan_amount'] / loan_df1['person_income']

# Experience buckets over right-closed intervals: (-1,0], (0,1], (1,5], (5,10], (10,inf).
# The top edge is open-ended (np.inf) so that "10+" really covers every value
# above 10; with a finite cap, larger values would fall outside all bins and
# become NaN.
# NOTE(review): the "<1" label covers (0, 1], so a value of exactly 1 is labelled
# "<1". Label kept unchanged for downstream compatibility -- confirm intent.
loan_df1['emp_length_bucket'] = pd.cut(
    loan_df1['person_emp_exp'],
    bins=[-1, 0, 1, 5, 10, np.inf],
    labels=["0", "<1", "1-5", "5-10", "10+"],
)

# Credit score bands. include_lowest=True makes the first interval closed on the
# left, so a score of exactly 300 is "Poor" instead of NaN. Scores outside
# 300-850 still map to NaN.
loan_df1['credit_score_bucket'] = pd.cut(
    loan_df1['credit_score'],
    bins=[300, 580, 670, 740, 800, 850],
    labels=["Poor", "Fair", "Good", "Very Good", "Excellent"],
    include_lowest=True,
)

# Binary flag for a prior default; any value other than 'Yes'/'No' maps to NaN.
loan_df1['prev_default_flag'] = loan_df1['previous_loan_defaults_on_file'].map({'Yes': 1, 'No': 0})


In [9]:
# Split person_income at its quartiles so each bucket holds roughly a quarter
# of the rows: bottom 25%, next 25%, next 25%, top 25%.
loan_df1['income_bucket'] = pd.qcut(
    loan_df1['person_income'],
    q=[0, 0.25, 0.5, 0.75, 1.0],   # same cut points as q=4
    labels=["Low", "Medium", "High", "Very High"],
)

# Combine the income bucket with the loan purpose, e.g. "Low_PERSONAL".
income_bucket_text = loan_df1['income_bucket'].astype(str)
loan_df1['income_intent'] = income_bucket_text + "_" + loan_df1['loan_intent']



In [10]:
# Spot-check five random rows after feature engineering. The fixed random_state
# makes the displayed rows identical on every Restart & Run All.
loan_df1.sample(5, random_state=42)

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amount,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status,debt_to_income,emp_length_bucket,credit_score_bucket,prev_default_flag,income_bucket,income_intent
16662,24,male,Master,102655,1,MORTGAGE,7500,EDUCATION,13.99,0.07,4,605,Yes,0,0.07306,<1,Fair,1,Very High,Very High_EDUCATION
34980,30,female,Associate,95971,9,MORTGAGE,14968,VENTURE,11.04,0.16,7,606,No,0,0.155964,5-10,Fair,0,Very High,Very High_VENTURE
34188,26,female,Associate,41472,3,OWN,8000,PERSONAL,9.43,0.19,7,631,Yes,0,0.192901,1-5,Fair,1,Low,Low_PERSONAL
9391,26,female,Master,75186,5,MORTGAGE,12000,PERSONAL,7.9,0.16,2,594,No,0,0.159604,1-5,Fair,0,High,High_PERSONAL
31734,45,male,Doctorate,142320,22,MORTGAGE,9600,EDUCATION,13.99,0.07,12,745,No,1,0.067454,10+,Very Good,0,Very High,Very High_EDUCATION


In [11]:
# Write the engineered fraud features to disk; index=False keeps the row index
# out of the file.
fraud_df1.to_csv(path_or_buf="fraud_features.csv", index=False)


In [12]:
# Write the engineered loan features to disk; index=False keeps the row index
# out of the file.
loan_df1.to_csv(path_or_buf="loan_features.csv", index=False)