# 🧹 Data Preprocessing & EDA
This notebook handles missing values, performs data cleaning, merges geolocation data, performs feature engineering, and prepares data for modeling.

### 📦 Imports

In [161]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import ipaddress


## 📂 Load Data

In [162]:
fraud_df = pd.read_csv('../../data/raw/Fraud_Data.csv')
ip_df = pd.read_csv('../../data/raw/IpAddress_to_Country.csv')
cc_df = pd.read_csv('../../data/raw/creditcard.csv')

In [163]:
fraud_df.sample(5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
101157,345341,2015-05-09 15:36:30,2015-06-03 13:01:19,18,CSWSYXXJQYJEQ,Ads,Chrome,F,36,3787511000.0,0
856,190496,2015-07-26 09:30:26,2015-08-21 20:09:18,30,VJYDTUKTVMFTJ,Direct,Safari,M,35,3842933000.0,0
140042,28267,2015-07-22 01:02:51,2015-07-24 17:54:58,50,BVKJREOYSBUFM,SEO,IE,M,32,1365387000.0,0
52072,297815,2015-07-21 04:59:13,2015-08-12 03:06:23,34,CEYEQAWGTVOZN,Direct,Chrome,M,29,2300459000.0,0
73080,234125,2015-06-19 02:20:10,2015-07-09 03:06:14,38,DLBTITZJRFMRZ,Ads,Chrome,F,41,1445593000.0,0


In [164]:
ip_df.sample(5)

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
54607,3104713000.0,3104713727,Russian Federation
114992,3406633000.0,3406633215,China
69361,3230868000.0,3230868479,United States
18358,1495267000.0,1495269375,Sweden
72237,3233720000.0,3233720575,United States


In [165]:
cc_df.sample(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
125965,77878.0,-1.759075,0.213638,0.838361,-1.793215,-0.663088,-0.659998,-0.665247,0.902891,-1.500009,...,-0.211032,-0.681568,-0.238735,-0.273718,0.159588,1.009719,-0.408485,-0.1629,0.95,0
122349,76527.0,-0.153722,0.086604,1.823014,-0.833608,-0.817505,-0.058096,-0.397143,0.259415,-1.456034,...,0.290229,0.711955,-0.042759,0.212116,-0.613315,-0.203033,0.166172,0.130151,2.5,0
193056,129974.0,-0.51399,1.086294,2.008046,4.557913,0.682915,1.27229,0.07438,0.374332,-1.471273,...,0.101431,0.419645,-0.41344,0.67983,0.496347,0.683957,0.08925,0.089096,1.51,0
167918,118964.0,1.995431,-0.897238,-1.242638,-0.717488,-0.645081,-0.297513,-0.997411,0.088555,-0.188549,...,0.321742,0.828099,0.094594,0.602728,-0.205192,-0.142937,0.012529,-0.006165,69.99,0
8128,11030.0,-0.999581,1.069343,1.537029,-1.030109,-0.258109,-0.348464,0.021868,0.361678,0.708899,...,0.125665,0.403617,-0.191658,0.028897,-0.253035,0.818234,-0.251397,0.021326,15.95,0


## 🧼 Data Cleaning

### Missing Values

In [166]:
#count missing values in the df
fraud_df.isna().sum()

user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64

In [167]:
#check for missing values in the ip df
ip_df.isna().sum()

lower_bound_ip_address    0
upper_bound_ip_address    0
country                   0
dtype: int64

In [168]:
#check for missing values in credit card df
cc_df.isna().sum().sum()

0

In [169]:
# Convert timestamps
fraud_df['signup_time'] = pd.to_datetime(fraud_df['signup_time'])
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'])

# Remove duplicates
fraud_df.drop_duplicates(inplace=True)

In [170]:
fraud_df

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,7.327584e+08,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,3.503114e+08,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2.621474e+09,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3.840542e+09,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,4.155831e+08,0
...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,2015-03-29 00:30:47,43,XPSKTWGPWINLR,SEO,Chrome,M,28,3.451155e+09,1
151108,274471,2015-05-15 17:43:29,2015-05-26 12:24:39,35,LYSFABUCPCGBA,SEO,Safari,M,32,2.439047e+09,0
151109,368416,2015-03-03 23:07:31,2015-05-20 07:07:47,40,MEQHCSJUBRBFE,SEO,IE,F,26,2.748471e+09,0
151110,207709,2015-07-09 20:06:07,2015-09-07 09:34:46,46,CMCXFGRHYSTVJ,SEO,Chrome,M,37,3.601175e+09,0


In [171]:
fraud_df[fraud_df['user_id']==1359]

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1


In [172]:
ip_df.dtypes

lower_bound_ip_address    float64
upper_bound_ip_address      int64
country                    object
dtype: object

## 🌐 IP Geolocation Mapping

In [173]:
fraud_df['ip_int'] = fraud_df['ip_address'].astype(float).astype(int)

# Make sure bounds are int
ip_df['lower_bound_ip_address'] = ip_df['lower_bound_ip_address'].astype(int)
ip_df['upper_bound_ip_address'] = ip_df['upper_bound_ip_address'].astype(int)

# Merge based on range: use pd.merge_asof or manual interval join
merged_df = pd.merge_asof(
    fraud_df.sort_values('ip_int'),
    ip_df.sort_values('lower_bound_ip_address'),
    left_on='ip_int',
    right_on='lower_bound_ip_address',
    direction='backward'
)

# Make sure bounds are int
merged_df['lower_bound_ip_address'] = merged_df['lower_bound_ip_address'].fillna(-1).astype('int')
merged_df['upper_bound_ip_address'] = merged_df['upper_bound_ip_address'].fillna(-1).astype('int')

# Optional: filter where ip_int is also <= upper_bound_ip_address
merged_df = merged_df[merged_df['ip_int'] <= merged_df['upper_bound_ip_address']]



In [174]:
# Check how many are missing
missing = merged_df['country'].isna().sum()
print(f"Missing countries: {missing}")

# Option 1: Fill with 'Unknown'
merged_df['country'].fillna('Unknown', inplace=True)

# Option 2: Drop if you're okay losing those rows
# fraud_df = fraud_df.dropna(subset=['country'])

Missing countries: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['country'].fillna('Unknown', inplace=True)


In [175]:
merged_df.sample(5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_int,lower_bound_ip_address,upper_bound_ip_address,country
127266,390772,2015-03-27 00:47:40,2015-07-12 12:08:02,32,LWTLZOXFHUBFI,SEO,IE,M,48,3635993000.0,0,3635992557,3635970048,3636002815,United States
112895,318628,2015-04-17 11:43:38,2015-05-10 05:01:00,58,WRHWYMWREVTWS,Direct,IE,M,29,3231212000.0,0,3231212054,3231212032,3231212287,Switzerland
8897,35181,2015-04-22 23:56:51,2015-06-26 08:17:10,11,TZJGKQLFCIFGG,Ads,Safari,F,42,243177100.0,0,243177066,242221056,243269631,China
91576,234108,2015-01-18 08:03:18,2015-03-28 15:02:36,46,AYXPNYNFSSATZ,SEO,FireFox,M,48,2610384000.0,0,2610384022,2610364416,2610429951,United States
69080,23139,2015-07-28 06:59:12,2015-11-15 13:54:09,9,DAWETKXLRDSQL,Ads,IE,M,44,1977142000.0,0,1977142457,1975517184,1979711487,India


In [176]:
merged_df.isna().sum().sort_values(ascending=False)

user_id                   0
signup_time               0
purchase_time             0
purchase_value            0
device_id                 0
source                    0
browser                   0
sex                       0
age                       0
ip_address                0
class                     0
ip_int                    0
lower_bound_ip_address    0
upper_bound_ip_address    0
country                   0
dtype: int64

In [177]:
# Merge with country info
merged_df = pd.merge_asof(
    fraud_df.sort_values('ip_int'),
    ip_df.sort_values('lower_bound_ip_address'),
    left_on='ip_int', right_on='lower_bound_ip_address'
)
# Make sure bounds are int
merged_df['lower_bound_ip_address'] = merged_df['lower_bound_ip_address'].fillna(-1).astype('int')
merged_df['upper_bound_ip_address'] = merged_df['upper_bound_ip_address'].fillna(-1).astype('int')

In [178]:
merged_df.sample(5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_int,lower_bound_ip_address,upper_bound_ip_address,country
103626,157607,2015-01-06 06:01:57,2015-04-14 03:29:01,50,TFUYBVNWCZSFX,Ads,FireFox,F,19,2961057000.0,0,2961057262,2961057024,2961057279,Ireland
77268,66457,2015-01-02 12:16:13,2015-01-02 12:16:14,39,FFYVRWMTLSYPY,Ads,FireFox,M,32,2202518000.0,1,2202518487,2202468352,2202533887,United States
17767,36844,2015-06-22 10:44:51,2015-10-18 13:09:59,38,PIVAPUFFKHXEB,SEO,FireFox,M,34,488686800.0,0,488686795,486539264,503316479,United States
38670,204464,2015-06-14 15:47:57,2015-09-08 14:45:26,94,EVBRLIHLXGUEI,SEO,Chrome,F,40,1110704000.0,0,1110704285,1110704128,1110835199,United States
43281,65286,2015-04-09 12:00:19,2015-04-23 18:44:05,35,PVZCDFLNZYYRI,Ads,Chrome,F,25,1235775000.0,0,1235775131,1224736768,1241513983,United States


## 🧠 Feature Engineering

In [179]:
# Time since signup
merged_df['time_since_signup'] = (merged_df['purchase_time'] - merged_df['signup_time']).dt.total_seconds() / 3600
# Hour and day of transaction
# merged_df['hour_of_day'] = merged_df['purchase_time'].dt.hour
# merged_df['day_of_week'] = merged_df['purchase_time'].dt.dayofweek

merged_df['dayofweek'] = merged_df['purchase_time'].dt.dayofweek
merged_df['hour'] = merged_df['purchase_time'].dt.hour
merged_df['is_weekend'] = merged_df['purchase_time'].dt.dayofweek >= 5

merged_df.drop(columns=['purchase_time','signup_time'],inplace=True)



In [180]:
merged_df.sample(5)

Unnamed: 0,user_id,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_int,lower_bound_ip_address,upper_bound_ip_address,country,time_since_signup,dayofweek,hour,is_weekend
76157,301321,22,BREXLYSXMSBIH,Direct,Chrome,M,45,2171739000.0,0,2171739074,2171731968,2171797503,United States,2382.229722,5,19,True
67553,306182,38,JFRGGSPKFJSBD,Direct,FireFox,M,23,1936226000.0,0,1936225567,1936195584,1936457727,China,479.479444,0,11,False
130950,141233,26,PUFHLHGVEVFHM,SEO,Chrome,F,42,3737276000.0,0,3737276399,3737124864,3737387007,China,936.664722,1,20,False
55622,315550,34,NGFBKUSDRMCRH,SEO,Chrome,M,23,1571428000.0,0,1571427808,1571422208,1571553279,Czech Republic,641.565556,5,0,True
23582,228458,20,SSSTXKFVAPGXT,Direct,Safari,M,22,647374200.0,1,647374210,637534208,654311423,United States,283.407778,6,8,True


In [181]:
merged_df.rename(columns={
    'class': 'is_fraud',
    'ip_address': 'ip_address_raw'
}, inplace=True)


### Rename columns 

In [182]:
merged_df.sample()

Unnamed: 0,user_id,purchase_value,device_id,source,browser,sex,age,ip_address_raw,is_fraud,ip_int,lower_bound_ip_address,upper_bound_ip_address,country,time_since_signup,dayofweek,hour,is_weekend
141455,43730,47,NAMOUEGTBRCOQ,SEO,Chrome,M,28,4028427000.0,0,4028427130,3758096128,3758096383,Australia,1360.5975,3,5,False


### Reorder important columns

In [183]:
cols = [
    'user_id', 'time_since_signup', 'time_since_signup',
    'purchase_value', 'device_id', 'source', 'browser', 'sex', 'age',
    'ip_address_raw', 'ip_int', 'country', 'hour', 'dayofweek','is_weekend',
    'is_fraud'
]
final_df = merged_df[cols]


In [184]:
# Transaction frequency and velocity features
fraud_df.groupby('user_id')['purchase_time'].count()
fraud_df.groupby('device_id')['purchase_time'].nunique().sort_values(ascending=False)

device_id
KIPFSCNUGOLDP    20
NGQCKIADMZORL    20
CQTUVBYIWWWBC    20
ITUMJCKWEYNDD    20
EQYVNEGOFLAWK    20
                 ..
IXVBQLPWSAIDA     1
IXUYCZZVDXGPO     1
IXUWTGATQJEVG     1
IXUKDXQBVCYCZ     1
ZZZXASJUVUNMV     1
Name: purchase_time, Length: 137956, dtype: int64

In [185]:
#Handle Class Imbalance
fraud_df['class'].value_counts(normalize=True)

class
0    0.906354
1    0.093646
Name: proportion, dtype: float64

In [186]:
final_df.sample(5)

Unnamed: 0,user_id,time_since_signup,time_since_signup.1,purchase_value,device_id,source,browser,sex,age,ip_address_raw,ip_int,country,hour,dayofweek,is_weekend,is_fraud
107147,64621,2617.706944,2617.706944,18,ZVCDWXKLTZODS,SEO,IE,F,39,3059105000.0,3059105182,China,18,5,True,0
23430,290485,1693.424722,1693.424722,68,YGSLLUXMXOPSZ,Ads,FireFox,M,42,643869200.0,643869185,United States,3,1,False,0
129621,89283,1383.159444,1383.159444,17,BAJJCIJNEEIXZ,Direct,IE,M,31,3699778000.0,3699778221,Taiwan; Republic of China (ROC),12,4,False,0
108087,294991,1842.256389,1842.256389,35,QVSWOFKVZCBTP,Direct,FireFox,F,49,3085132000.0,3085132255,China,2,4,False,0
122363,327017,860.930556,860.930556,21,NFTVBIHBFYEKG,SEO,Chrome,F,23,3503462000.0,3503462036,United States,10,6,True,1


### Normalization and Scaling

In [187]:
# Choose numeric features to scale
numeric_features = ['purchase_value', 'age', 'time_since_signup', 'hour', 'dayofweek','is_weekend']

scaler = StandardScaler()
final_df[numeric_features] = scaler.fit_transform(final_df[numeric_features])

# Optional: view result
print(final_df[numeric_features].head())


   purchase_value       age  time_since_signup  time_since_signup      hour  \
0        0.494721  0.331793          -1.013679          -1.013679 -0.220124   
1       -0.214781 -0.364448          -1.230613          -1.230613  0.792542   
2       -0.214781 -0.132367          -1.337931          -1.337931 -0.509456   
3       -0.214781  0.795954           0.800513           0.800513  1.371208   
4        0.985915  0.563874          -1.127359          -1.127359 -0.654123   

   dayofweek  is_weekend  
0   1.489476    1.568716  
1   0.492565   -0.637464  
2  -1.002803   -0.637464  
3  -0.005891   -0.637464  
4   1.489476    1.568716  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df[numeric_features] = scaler.fit_transform(final_df[numeric_features])


### Encoding Categorical Features

In [188]:
cols_to_encode=['source','browser','sex','country','device_id']
le = LabelEncoder()
for col in cols_to_encode:
    final_df[col] = le.fit_transform(final_df[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df[col] = le.fit_transform(final_df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df[col] = le.fit_transform(final_df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df[col] = le.fit_transform(final_df[col])
A value is trying to be set on a copy of a slice from a DataF

In [189]:
final_df.sample(5)

Unnamed: 0,user_id,time_since_signup,time_since_signup.1,purchase_value,device_id,source,browser,sex,age,ip_address_raw,ip_int,country,hour,dayofweek,is_weekend,is_fraud
82356,83582,0.437801,0.437801,-0.651398,35507,0,0,0,0.679914,2343657000.0,2343656902,171,-0.075457,-1.501259,-0.637464,0
2565,305525,0.543715,0.543715,-0.869706,23149,1,2,1,0.795954,69387450.0,69387447,171,0.503209,0.492565,-0.637464,0
110122,90441,-0.882434,-0.882434,1.367955,43410,2,0,1,-0.248408,3154623000.0,3154623484,60,-1.666788,-1.002803,-0.637464,0
26884,37426,-0.003033,-0.003033,2.077457,64414,1,2,0,-0.016327,736641400.0,736641420,84,0.792542,-0.005891,-0.637464,0
91563,45266,-0.662795,-0.662795,-0.924284,31908,2,0,0,0.215753,2610085000.0,2610084804,171,-0.36479,0.492565,-0.637464,0


### Save to CSV for modeling:

In [190]:
final_df.to_csv("../../data/interim/interim_fraud_data.csv", index=False)

In [191]:
final_df.sample(5)

Unnamed: 0,user_id,time_since_signup,time_since_signup.1,purchase_value,device_id,source,browser,sex,age,ip_address_raw,ip_int,country,hour,dayofweek,is_weekend,is_fraud
542,369157,0.223489,0.223489,-0.105627,39976,2,2,1,-0.132367,14504030.0,14504026,181,-1.522122,1.489476,1.568716,0
58128,31864,-0.967919,-0.967919,-0.542244,36424,0,0,1,0.679914,1638991000.0,1638991029,171,-0.798789,-1.501259,-0.637464,0
84711,317088,-0.81513,-0.81513,-1.033438,12299,1,0,0,-0.596528,2408810000.0,2408810138,171,1.226541,1.489476,1.568716,0
127840,388080,-1.564185,-1.564185,-1.306323,38335,1,0,1,-0.944649,3651237000.0,3651236699,56,-0.654123,-0.504347,-0.637464,0
124333,256718,-0.170268,-0.170268,-0.43309,99123,0,2,1,-1.060689,3557167000.0,3557167258,139,1.515874,-1.501259,-0.637464,0


In [192]:
final_df['user_id'].is_unique

True

## ⚖️ Handle Class Imbalance (SMOTE)

In [193]:
features = merged_df[['purchase_value', 'time_since_signup', 'hour', 'dayofweek','is_weekend']]
target = merged_df['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, stratify=target)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [194]:
X_resampled.shape

(191744, 5)

## Credit

In [195]:
cc_df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


### class inbalance

In [196]:
cc_df['Class'].value_counts(normalize=True)

Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64

### scaling

In [197]:
cc_df['scaled_time'] = scaler.fit_transform(cc_df[['Time']])
cc_df['scaled_amount'] = scaler.fit_transform(cc_df[['Amount']])

#drop originals
cc_df.drop(['Time', 'Amount'], axis=1, inplace=True)
cc_df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,scaled_time,scaled_amount
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,-1.996583,0.244964
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,0,-1.996583,-0.342475
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,-1.996562,1.160686
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,-1.996562,0.140534
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,0,-1.996541,-0.073403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0,1.641931,-0.350151
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,0,1.641952,-0.254117
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,0,1.641974,-0.081839
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,0,1.641974,-0.313249


In [198]:
cc_df.to_csv("../../data/interim/interim_creditcard.csv", index=False)