In [1]:
import pandas as pd

In [2]:
# Load the transaction records from the CSV file into the `fraud` DataFrame.
fraud = pd.read_csv(filepath_or_buffer='Fraud.csv')

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
fraud.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" to the output.
fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None


In [5]:
# Summary statistics for the numeric columns. Leaving the frame as the cell's
# last expression lets the notebook render it as one table instead of the
# line-wrapped plain text that print() produces.
fraud.describe()

               step        amount  oldbalanceOrg  newbalanceOrig  \
count  6.362620e+06  6.362620e+06   6.362620e+06    6.362620e+06   
mean   2.433972e+02  1.798619e+05   8.338831e+05    8.551137e+05   
std    1.423320e+02  6.038582e+05   2.888243e+06    2.924049e+06   
min    1.000000e+00  0.000000e+00   0.000000e+00    0.000000e+00   
25%    1.560000e+02  1.338957e+04   0.000000e+00    0.000000e+00   
50%    2.390000e+02  7.487194e+04   1.420800e+04    0.000000e+00   
75%    3.350000e+02  2.087215e+05   1.073152e+05    1.442584e+05   
max    7.430000e+02  9.244552e+07   5.958504e+07    4.958504e+07   

       oldbalanceDest  newbalanceDest       isFraud  isFlaggedFraud  
count    6.362620e+06    6.362620e+06  6.362620e+06    6.362620e+06  
mean     1.100702e+06    1.224996e+06  1.290820e-03    2.514687e-06  
std      3.399180e+06    3.674129e+06  3.590480e-02    1.585775e-03  
min      0.000000e+00    0.000000e+00  0.000000e+00    0.000000e+00  
25%      0.000000e+00    0.000000e+00

#check for missing values

In [7]:
# Count the null entries in each column (isna is the canonical spelling of isnull).
missing_values = fraud.isna().sum(axis=0)
print(missing_values)

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [9]:
#fill missing values (impute numeric columns with the column mean)

In [None]:
# Impute missing numeric values with the column mean.
# numeric_only=True keeps the text columns (type, nameOrig, nameDest) out of
# the mean computation; pandas >= 2.0 raises TypeError on them otherwise.
# The result is assigned back rather than mutating the loaded frame in place.
# NOTE: the missing-value check above reports zero nulls, so on this data the
# step is a no-op safeguard.
fraud = fraud.fillna(fraud.mean(numeric_only=True))

In [None]:
#handling outliers
import numpy as np

In [None]:
#detecting outliers using IQR


In [None]:
# Compute the quartiles and IQR per numeric feature only.
# - The text columns (type, nameOrig, nameDest) cannot be ranked, and
#   pandas >= 2.0 raises TypeError when quantile() meets them.
# - The 0/1 flags isFraud and isFlaggedFraud are excluded: with a mean of
#   ~0.0013 far fewer than 25% of rows are 1, so both quartiles are 0 and
#   IQR == 0. Any row with the flag set would then count as an "outlier",
#   i.e. every fraud case would be removed before modelling.
iqr_columns = fraud.select_dtypes(include='number').columns.drop(
    ['isFraud', 'isFlaggedFraud'], errors='ignore'
)
Q1 = fraud[iqr_columns].quantile(0.25)
Q3 = fraud[iqr_columns].quantile(0.75)
IQR = Q3 - Q1

In [None]:
#removing outliers

In [None]:
# Drop rows that fall outside the 1.5 * IQR fences in any numeric feature.
# Only columns that actually have an IQR are compared, and the 0/1 label
# columns are skipped: their IQR is 0, so every flagged (fraud) row would
# otherwise be treated as an outlier and removed.
outlier_columns = [
    col for col in IQR.index
    if col in fraud.columns and col not in ('isFraud', 'isFlaggedFraud')
]
lower_fence = (Q1 - 1.5 * IQR)[outlier_columns]
upper_fence = (Q3 + 1.5 * IQR)[outlier_columns]
feature_values = fraud[outlier_columns]

# .lt/.gt with axis=1 align the fences to the columns explicitly; the bare
# < and > operators refuse misaligned DataFrame/Series operands in pandas 2.x.
is_outlier = (
    feature_values.lt(lower_fence, axis=1) | feature_values.gt(upper_fence, axis=1)
).any(axis=1)
fraud = fraud[~is_outlier]

#Handling Multi-Collinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Calculate VIF for each feature

In [None]:
# Build the design matrix for the VIF computation.
# - The target column is named 'isFraud' (there is no 'fraudulent' column).
# - Only numeric columns are kept; the string columns cannot enter the
#   regressions that variance_inflation_factor runs.
X = fraud.drop(columns=['isFraud']).select_dtypes(include='number')

# Materialise the float matrix once instead of rebuilding it per feature.
design_matrix = X.to_numpy(dtype=float)

vif_fraud = pd.DataFrame({"feature": X.columns})
vif_fraud["VIF"] = [
    variance_inflation_factor(design_matrix, col_idx)
    for col_idx in range(design_matrix.shape[1])
]
print(vif_fraud)

# Remove features with high VIF

In [None]:
# Features whose VIF exceeds 10, the cut-off this notebook uses for "high".
high_vif = vif_fraud['VIF'] > 10
features_to_remove = vif_fraud.loc[high_vif, 'feature']

# Assign the result instead of dropping in place, and ignore columns that are
# already gone, so re-running this cell does not raise KeyError.
fraud = fraud.drop(columns=features_to_remove, errors='ignore')

# 2.Fraud Detection Model
# Model description


In [None]:
#We'll use a Random Forest classifier, which is robust and effective for classification tasks, including fraud detection.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest with a fixed seed for reproducible results.
# - class_weight='balanced' re-weights the classes inversely to their
#   frequency; the describe() output shows isFraud has a mean of ~0.0013, so
#   an unweighted forest is dominated by the non-fraud class.
# - n_jobs=-1 trains the trees on all available cores; with random_state set
#   the fitted model is unchanged by the degree of parallelism.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)

# 3. Variable Selection

In [None]:
#Select variables based on domain knowledge, feature importance from initial models, and correlation analysis.

In [None]:
# Numeric transaction attributes available in the dataset. The placeholder
# names 'feature1'..'feature4' and the target name 'fraudulent' do not exist
# in this frame and raised KeyError.
candidate_features = [
    'step',
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]
# Keep only the candidates that are still present: the VIF step may have
# dropped some of them from `fraud`.
selected_features = [col for col in candidate_features if col in fraud.columns]
X = fraud[selected_features]
y = fraud['isFraud']

# 4. Model Performance

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

In [4]:
# Split data

In [None]:
# Hold out 20% of the rows for evaluation with a fixed seed.
# stratify=y keeps the class ratio identical in both partitions; with a
# heavily imbalanced target a purely random split can leave the test set with
# a very different share of positive rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [12]:
#Train model

In [None]:
# Fit the classifier on the training partition.
model.fit(X=X_train, y=y_train)

In [None]:
# Predictions and Evaluation

In [None]:
# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Per-class probabilities; the second column is kept as the score used for AUC.
test_probabilities = model.predict_proba(X_test)
y_prob = test_probabilities[:, 1]

In [None]:
# Per-class precision / recall / F1 on the test set.
report_text = classification_report(y_test, y_pred)
print(report_text)

# Area under the ROC curve, computed from the predicted scores.
auc_value = roc_auc_score(y_test, y_prob)
print('AUC-ROC:', auc_value)

# Confusion matrix of true labels against predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# 5. Key Factors Predicting Fraudulent Customers

In [None]:
# Pair each model input column with the importance the fitted forest assigned
# to it, then list the features from most to least important.
feature_names = X.columns
importances = model.feature_importances_

feature_importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': importances}
).sort_values(by='importance', ascending=False)

print(feature_importance_df)