In [1]:
#Import the basic libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
# Load the test split, then discard its first column
# (listed as 'Unnamed: 0' by info(); presumably a saved row index - confirm).
df_test = pd.read_csv('fraudTest.csv')
df_test = df_test.drop(columns=df_test.columns[0])
df_test.sample(2)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
225193,2020-09-09 12:28:13,3575540972310993,"fraud_Cremin, Hamill and Reichel",misc_pos,2.16,Rachel,Villarreal,F,250 Carrie Throughway,University,...,34.3396,-89.5736,4198,Curator,2001-06-22,44900ee45b8228a24690c63db8ac211d,1378729693,34.577138,-90.458539,0
185523,2020-08-25 04:04:49,370877495212014,fraud_McDermott-Weimann,grocery_pos,207.6,Sarah,Clark,F,25961 Beverly Union Apt. 042,North Wilkesboro,...,36.2017,-81.1286,21134,Dispensing optician,1984-03-06,0425d4777e673aea01833e6b89f590a5,1377403489,36.073214,-80.181194,0


#### Let's clean the df_test set

In [7]:
# Print a per-column summary (non-null counts and dtypes) to check for missing values
# and columns that still need a dtype conversion.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

In [9]:
# Give the abbreviated source columns more descriptive names.
column_renames = {
    'trans_date_trans_time': 'transaction_time',
    "cc_num": "credit_card_no",
    "amt": "amount(usd)",
    "trans_num": "transaction_id",
}
df_test = df_test.rename(columns=column_renames)

In [10]:
# Convert the timestamp columns to datetime dtype:
#   - 'transaction_time' is parsed from its string representation
#   - 'time' is derived from the epoch-seconds column 'unix_time'

# `datetime` and `date` are used by the age() helper defined further down.
from datetime import datetime,date

# `infer_datetime_format` is deprecated in pandas 2.x (format inference is now the
# default behaviour), so it is no longer passed.
df_test['transaction_time'] = pd.to_datetime(df_test['transaction_time'])

# Converting from epoch time. unit='s' converts the whole column in one vectorised
# call and yields naive UTC timestamps, the same values the row-by-row
# datetime.utcfromtimestamp (deprecated since Python 3.12) produced.
df_test['time'] = pd.to_datetime(df_test['unix_time'], unit='s')

In [11]:
# Derive the hour-of-day feature and normalise the date-of-birth column.

# Hour component of the epoch-derived 'time' column
transaction_hours = df_test['time'].dt.hour
df_test['hourly_basis'] = transaction_hours

# Parse 'dob' as a datetime, then store it back as an MM-DD-YYYY string
# (the format the age() helper parses).
dob_parsed = pd.to_datetime(df_test['dob'])
df_test['dob'] = dob_parsed.dt.strftime('%m-%d-%Y')

# This function converts given date to age

def age(born, today=None):
    """Return the age in completed years for a date of birth.

    Parameters
    ----------
    born : str
        Date of birth formatted as ``MM-DD-YYYY``.
    today : datetime.date, optional
        Reference date the age is measured against. Defaults to the current
        local date (``date.today()``). Pass a fixed date to get results that
        do not depend on when the notebook is executed.

    Returns
    -------
    int
        Number of full years between ``born`` and ``today``.

    Raises
    ------
    ValueError
        If ``born`` does not match the ``MM-DD-YYYY`` format.
    """
    born = datetime.strptime(born, '%m-%d-%Y').date()
    if today is None:
        today = date.today()
    # One year is subtracted while this year's birthday is still ahead of `today`.
    birthday_passed = (today.month, today.day) >= (born.month, born.day)
    return today.year - born.year - (not birthday_passed)
   
# Compute each card holder's age from the MM-DD-YYYY 'dob' strings.
dob_strings = df_test['dob']
df_test['Age'] = dob_strings.apply(age)

In [13]:
# Store the card number as a categorical label rather than as an integer.
card_numbers = df_test['credit_card_no']
df_test['credit_card_no'] = card_numbers.astype('category')

In [14]:
# Fixing skewness: map 'amount(usd)' onto a normal distribution and store the
# result in a new 'amount_qt' column.
from sklearn.preprocessing import PowerTransformer, QuantileTransformer

# random_state pins the random subsampling QuantileTransformer performs when the
# input has more rows than its `subsample` limit, so 'amount_qt' is identical
# on every run instead of varying slightly between executions.
# NOTE(review): the transformer is fitted on df_test itself; if another split is
# transformed with a separately fitted instance the two mappings will differ -
# confirm this is intended.
qt = QuantileTransformer(n_quantiles=500, output_distribution='normal', random_state=42)
x = np.array(df_test['amount(usd)']).reshape(-1, 1)  # scikit-learn expects a 2-D (n_rows, 1) array
y = qt.fit_transform(x)
df_test['amount_qt'] = y

In [15]:
# Ordinal encoding: replace the listed text columns with integer codes.
from sklearn.preprocessing import OrdinalEncoder

encoded_columns = ['category','merchant','job','transaction_id']

OE = OrdinalEncoder(dtype=np.int64)
# Assign through df[cols] rather than df.loc[:, cols]: since pandas 2.0 the .loc
# form writes into the existing columns in place, which leaves the integer codes
# stored in object-dtype columns. Replacing the columns gives them int64 dtype.
# NOTE(review): the encoder is fitted on df_test itself; the codes only line up
# with another split if that split is encoded with this same fitted encoder -
# confirm.
df_test[encoded_columns] = OE.fit_transform(df_test[encoded_columns])

In [17]:
# Persist the cleaned frame. The default index=True is kept, so the row index is
# written as the leading, unnamed column of the file.
df_test.to_csv(path_or_buf='dftest_cleaned.csv')