In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier

In [None]:
train = pd.read_csv('Projects/credit_card_fraud_detection/fraudTrain.csv/fraudTrain.csv')

test = pd.read_csv('Projects/credit_card_fraud_detection/fraudTest.csv/fraudTest.csv')

df = pd.read_csv('credit_card_fraud_detection/fraudTrain.csv/fraudTrain.csv')

pd.DataFrame(train)

train.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [3]:
train.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,648337.0,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652
std,374318.0,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,324168.5,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0
50%,648337.0,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0
75%,972505.5,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0
max,1296674.0,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0


In [None]:
def add_features(df):
    # Moratab sazi bar asase shomare kart va zamane tarakonesh
    df = df.sort_values(['cc_num', 'trans_date_trans_time'])
    
    # Tabdile sotun haye zamani be formate standarde datetime
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df['dob'] = pd.to_datetime(df['dob'])

    # Sakhte ye dataframe movaghat
    temp = df.set_index("trans_date_trans_time")
    grp = temp.groupby('cc_num')['amt']
    
    # Mohasebe ye tedad va miangine tarakonesh ha dar 24 saate gozashte
    df['trans_count_24h'] = grp.rolling('24h', closed='left').count().values
    df['mean_amt'] = grp.rolling('24h', closed='left').mean().values
    
    # Por kardane maghadire khali (NaN) ba 0 ya meghdare khode tarakonesh
    df['trans_count_24h'] = df['trans_count_24h'].fillna(0)
    df['mean_amt'] = df['mean_amt'].fillna(df['amt'])

    # Mohasebeye fasele zamani (be sanie) az akharin tarakonesh
    df["seconds_from_last_transact"] = df.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds()
    df["seconds_from_last_transact"] = df["seconds_from_last_transact"].fillna(-1)

    # Estekhraje saat, rooze hafte va senne karbar
    df["hour"] = df["trans_date_trans_time"].dt.hour
    df["day"] = df["trans_date_trans_time"].dt.dayofweek
    df["age"] = (df["trans_date_trans_time"]-df['dob']).dt.days//365

    # Mohasebeye faseleye joghrafiayi beyne karbar va forushgah (Euclidean distance)
    df['dist'] = np.sqrt((df['lat'] - df['merch_lat'])**2 + (df['long'] - df['merch_long'])**2)

    return df



In [None]:
def preprocess(df, encoder=None, train=True):
    # Ezafe kardane feature haye jadid 
    df = add_features(df)

    # Liste sotun haye categorical baraye tabdil be adad
    cat_cols = ['merchant', 'category', 'gender', 'job']

    if train:
        # Sakhte encoder baraye dadehaye Train va handling maghadire jadid
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        df[cat_cols] = encoder.fit_transform(df[cat_cols])
    else:
        # Estefade az hamon encodere ghabli baraye dade-haye Test
        df[cat_cols] = encoder.transform(df[cat_cols])
    
    # Hazfe sotunhayi ke dar modelsazi tasir nadaran ya ghablan estefade shodan
    df = df.drop(columns=['Unnamed: 0', 'first', 'last', 'street', 'city',
        'state', 'zip', 'trans_num', 'dob',
        'trans_date_trans_time', 'cc_num'])

    return df, encoder

# Ejraye preprocess ruye dataframe asli
preprocess(df)


(         merchant  category     amt  gender      lat      long  city_pop  \
 1017        293.0       8.0    7.27     0.0  43.0048 -108.8964      1645   
 2724         43.0       2.0   52.94     0.0  43.0048 -108.8964      1645   
 2726        399.0       2.0   82.08     0.0  43.0048 -108.8964      1645   
 2882        126.0       7.0   34.79     0.0  43.0048 -108.8964      1645   
 2907         41.0       6.0   27.18     0.0  43.0048 -108.8964      1645   
 ...           ...       ...     ...     ...      ...       ...       ...   
 1294934      44.0      10.0   60.47     1.0  41.1730  -89.2187       532   
 1295369      47.0       2.0   74.29     1.0  41.1730  -89.2187       532   
 1295587     503.0      11.0  246.56     1.0  41.1730  -89.2187       532   
 1296206     287.0      12.0    2.62     1.0  41.1730  -89.2187       532   
 1296427     508.0       2.0   39.29     1.0  41.1730  -89.2187       532   
 
            job   unix_time  merch_lat  merch_long  is_fraud  trans_count_

In [None]:
train = pd.read_csv('credit_card_fraud_detection/fraudTrain.csv/fraudTrain.csv')

train,encoder = preprocess(train,train = True)

X_train = train.drop("is_fraud", axis=1)
y_train = train["is_fraud"]
scale = (y_train == 0).sum()/(y_train == 1).sum()

In [7]:
model = LGBMClassifier(n_estimators=1000,learning_rate=0.05,scale_pos_weight = scale)
model.fit(X_train,y_train)

[LightGBM] [Info] Number of positive: 7506, number of negative: 1289169
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021925 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3203
[LightGBM] [Info] Number of data points in the train set: 1296675, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005789 -> initscore=-5.146050
[LightGBM] [Info] Start training from score -5.146050


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,1000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
test = pd.read_csv("Projects/credit_card_fraud_detection/fraudTest.csv/fraudTest.csv")

test,_ = preprocess(test, encoder,train = False)

X_test = test.drop('is_fraud',axis=1)
y_test = test["is_fraud"]

model_pred = (model.predict_proba(X_test)[:,1]>0.8).astype(int)

print(classification_report(y_test,model_pred))



              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.92      0.91      0.92      2145

    accuracy                           1.00    555719
   macro avg       0.96      0.96      0.96    555719
weighted avg       1.00      1.00      1.00    555719

