In [1]:
# Importer les packages
import pandas as pd
import seaborn as sb
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Data Preprocessing & Handling Data
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Model Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Optional for Deep Learning (if needed)
import tensorflow as tf
from tensorflow import keras

In [2]:
#Faire le connexion entre le drive et colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#LRead the csv file
data=pd.read_csv('/content/drive/MyDrive/Future_Intern/credit_card_transactions.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,28705.0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,83236.0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,22844.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 24 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [6]:
data.shape

(1296675, 24)

#Data Preprocessing

In [7]:
#Categorical values
df_cat = data.select_dtypes(include=['object', 'category']).columns
df_cat = data[df_cat]

In [8]:
# Null variable which columns have it in.
nan_counts = df_cat.isnull().sum()
print(nan_counts)

trans_date_trans_time    0
merchant                 0
category                 0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
job                      0
dob                      0
trans_num                0
dtype: int64


In [9]:
# only choice 2 categorical features because of REASONABLENESS
df_cat = df_cat[["gender","category"]]
df_cat.head()

Unnamed: 0,gender,category
0,F,misc_net
1,F,grocery_pos
2,M,entertainment
3,M,gas_transport
4,M,misc_pos


In [10]:
# Display the unique values for the "category" column
unique_cat = df_cat["category"].unique()
print(unique_cat)

['misc_net' 'grocery_pos' 'entertainment' 'gas_transport' 'misc_pos'
 'grocery_net' 'shopping_net' 'shopping_pos' 'food_dining' 'personal_care'
 'health_fitness' 'travel' 'kids_pets' 'home']


In [11]:
# Display the number of unique values for the "category" column
unique_counts = df_cat["category"].nunique()
print(unique_counts)

14


In [12]:
# Encoding with categorical features: (I would like to use LabelEncoder)
label_encoder = LabelEncoder()
categorical_columns = ['gender', 'category']

for col in categorical_columns:
    df_cat[col] = label_encoder.fit_transform(df_cat[col])
df_cat.head()

Unnamed: 0,gender,category
0,0,8
1,0,4
2,1,0
3,1,2
4,1,9


In [13]:
#Numeric variable check and preprocessing:
df_num = data.select_dtypes(include=['number']).columns
print(df_num)
df_num = data[df_num]

df_num.head()

Index(['Unnamed: 0', 'cc_num', 'amt', 'zip', 'lat', 'long', 'city_pop',
       'unix_time', 'merch_lat', 'merch_long', 'is_fraud', 'merch_zipcode'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
0,0,2703186189652095,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0,28705.0
1,1,630423337322,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0,
2,2,38859492057661,220.11,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0,83236.0
3,3,3534093764340240,45.0,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0,
4,4,375534208663984,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0,22844.0


In [14]:
df_num = df_num[["amt","is_fraud"]]
df_num.head()

Unnamed: 0,amt,is_fraud
0,4.97,0
1,107.23,0
2,220.11,0
3,45.0,0
4,41.96,0


In [15]:
#concate numeric & categorical dataset
df = pd.concat([df_cat, df_num], axis=1)

#Models

In [17]:
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

##RandomForest

In [18]:
model = RandomForestClassifier(n_estimators = 100, random_state=24,min_samples_leaf=4,max_depth=10,min_samples_split=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [19]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9970655715580234
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257801
           1       0.80      0.67      0.73      1534

    accuracy                           1.00    259335
   macro avg       0.90      0.83      0.86    259335
weighted avg       1.00      1.00      1.00    259335



##KNeighbors

In [20]:
model_2 = KNeighborsClassifier()
model_2.fit(X_train,y_train)
y_pred = model_2.predict(X_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9965295852854416
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257801
           1       0.74      0.63      0.68      1534

    accuracy                           1.00    259335
   macro avg       0.87      0.82      0.84    259335
weighted avg       1.00      1.00      1.00    259335

