In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

In [3]:
# Read the card-transaction CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's current
# working directory.
data = pd.read_csv(filepath_or_buffer='Downloads/card_transdata.csv')

In [4]:
# Explore the dataset: preview the first rows, then show the column summary.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()

   distance_from_home  distance_from_last_transaction  \
0           57.877857                        0.311140   
1           10.829943                        0.175592   
2            5.091079                        0.805153   
3            2.247564                        5.600044   
4           44.190936                        0.566486   

   ratio_to_median_purchase_price  repeat_retailer  used_chip  \
0                        1.945940              1.0        1.0   
1                        1.294219              1.0        0.0   
2                        0.427715              1.0        0.0   
3                        0.362663              1.0        1.0   
4                        2.222767              1.0        1.0   

   used_pin_number  online_order  fraud  
0              0.0           0.0    0.0  
1              0.0           0.0    0.0  
2              0.0           1.0    0.0  
3              0.0           1.0    0.0  
4              0.0           1.0    0.0  
<class 'pandas

In [10]:
# Fill missing values in the three continuous columns with the column mean.
# Each column is fitted and transformed on its own with the same imputer
# object, so after the loop the imputer only holds the statistics of the
# last column processed.
# NOTE(review): the means are computed over the full dataset, i.e. before the
# train/test split performed later in the notebook.
imputer = SimpleImputer(strategy='mean')
for column in (
    'distance_from_home',
    'distance_from_last_transaction',
    'ratio_to_median_purchase_price',
):
    data[column] = imputer.fit_transform(data[[column]])

In [12]:
# Handle duplicates
# Drops rows that repeat an earlier row. inplace=True modifies `data` itself
# instead of returning a new DataFrame, so re-running earlier cells is needed
# to get the unfiltered frame back.
data.drop_duplicates(inplace=True)

In [14]:
# Rescale the three continuous columns with RobustScaler so that extreme
# values have less influence on the scaling. Values are rescaled, not removed
# or clipped, so outlying rows remain in the data.
# Each column is fitted and transformed on its own with the same scaler
# object, so after the loop the scaler only holds the statistics of the last
# column processed.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook.
scaler = RobustScaler()
for column in (
    'distance_from_home',
    'distance_from_last_transaction',
    'ratio_to_median_purchase_price',
):
    data[column] = scaler.fit_transform(data[[column]])

In [16]:
# Separate the label from the predictors: `y` is the 'fraud' column and `X`
# is every remaining column of `data`.
y = data['fraud']
X = data.drop(columns='fraud')

In [28]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Build a random forest with default hyper-parameters (seeded so that
# training is repeatable) and fit it on the training split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X=X_train, y=y_train)

In [24]:
# Predict a class label for every row of the held-out test features.
y_pred = clf.predict(X=X_test)

In [25]:
# Score the predictions against the true test labels with three metrics,
# computed in this order: overall accuracy, confusion matrix, and the
# per-class precision/recall/F1 text report.
accuracy, conf_matrix, class_report = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

In [26]:
# Report the three evaluation results, one after another, on stdout.
print(
    f'Accuracy: {accuracy}',
    f'Confusion Matrix:\n{conf_matrix}',
    f'Classification Report:\n{class_report}',
    sep='\n',
)

Accuracy: 0.99999
Confusion Matrix:
[[182557      0]
 [     2  17441]]
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    182557
         1.0       1.00      1.00      1.00     17443

    accuracy                           1.00    200000
   macro avg       1.00      1.00      1.00    200000
weighted avg       1.00      1.00      1.00    200000

