In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
columns = [
    "date_created", "customer_ID", "external_reference", "operation_id",
    "status", "status_detail", "transaction_amount", "installments",
    "payment_type", "hour_created", "DOW_created", "month_created","shp_Carrier",
    "Shipping", "bill_zipcode", "shp_zipcode", "fraud_flag", "Item_1"
]

target = ["fraud_flag"]

In [3]:
# Load the data
file_path = Path('Resources/Whole_Collection.csv')
df = pd.read_csv(file_path)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,date_created,customer_ID,external_reference,operation_id,status,status_detail,transaction_amount,installments,payment_type,hour_created,DOW_created,month_created,shp_Carrier,Shipping,bill_zipcode,shp_zipcode,fraud_flag,Item_1
0,2022-05-17 17:02:12,2383,375285,22439390268,approved,accredited,4549.0,1,debit_card,17,1,5,DHL,139,88660,88660,No,TMG50V64GB
1,2022-05-17 17:00:42,2383,375284,22439310665,approved,accredited,4549.0,1,debit_card,17,1,5,DHL,139,88660,88660,No,TMG50V64GB
2,2022-05-17 13:23:07,2811,375213,22433076416,rejected,cc_rejected_high_risk,6378.0,3,credit_card,13,1,5,DHL,139,89350,89350,No,TMG7AZUL
3,2022-05-17 13:26:43,2811,375217,22433120457,rejected,cc_rejected_high_risk,6378.0,1,credit_card,13,1,5,DHL,139,89350,89350,No,TMG7AZUL
4,2022-05-17 15:01:25,4291,375235,22435577370,approved,accredited,6689.0,9,credit_card,15,1,5,FederalExpress,0,55067,55067,No,TMREALMEGTG


In [4]:
#droping information
df = df.drop(['date_created','external_reference','operation_id','status_detail','hour_created','DOW_created','shp_Carrier','Shipping','bill_zipcode','status','payment_type','Item_1'], axis=1)
df.head()

Unnamed: 0,customer_ID,transaction_amount,installments,month_created,shp_zipcode,fraud_flag
0,2383,4549.0,1,5,88660,No
1,2383,4549.0,1,5,88660,No
2,2811,6378.0,3,5,89350,No
3,2811,6378.0,1,5,89350,No
4,4291,6689.0,9,5,55067,No


In [5]:
# Define features set
X = df.copy()
X = X.drop("fraud_flag", axis=1)
X.head()

Unnamed: 0,customer_ID,transaction_amount,installments,month_created,shp_zipcode
0,2383,4549.0,1,5,88660
1,2383,4549.0,1,5,88660
2,2811,6378.0,3,5,89350
3,2811,6378.0,1,5,89350
4,4291,6689.0,9,5,55067


In [6]:
# Define target vector
y = df["fraud_flag"].values.reshape(-1, 1)
y[:5]

array([['No'],
       ['No'],
       ['No'],
       ['No'],
       ['No']], dtype=object)

In [12]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [13]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=500, random_state=78) 

In [15]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  


In [16]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [17]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4817,8
Actual 1,57,24


In [18]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [19]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4817,8
Actual 1,57,24


Accuracy Score : 0.9867509172441908
Classification Report
              precision    recall  f1-score   support

          No       0.99      1.00      0.99      4825
         Yes       0.75      0.30      0.42        81

    accuracy                           0.99      4906
   macro avg       0.87      0.65      0.71      4906
weighted avg       0.98      0.99      0.98      4906



In [20]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.33138261, 0.22064085, 0.02746666, 0.08749141, 0.33301847])

In [21]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.3330184678694324, 'shp_zipcode'),
 (0.3313826136352228, 'customer_ID'),
 (0.2206408465612854, 'transaction_amount'),
 (0.08749141333159882, 'month_created'),
 (0.027466658602460645, 'installments')]