In [107]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score as ras  # alias used by the model-comparison cell
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [108]:
# Load the transactions CSV into a pandas DataFrame.
# NOTE(review): the recorded run of this cell ended in a ParserError raised
# while reading the file -- verify the CSV is complete and readable locally.
DATA_PATH = "PS_20174392719_1491204439457_log.csv"
data = pd.read_csv(DATA_PATH)

# Preview the first rows only instead of rendering the whole frame.
data.head()

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Get familiar with the data: print the frame's column-level summary.
data.info()

In [None]:
# Descriptive statistics of the columns.
data.describe()

In [None]:
# Count the missing (null) values in each column.
data.isnull().sum()

In [None]:
# Count how many columns fall into each broad dtype family.
#
# select_dtypes matches whole dtype families (every integer / floating width).
# Comparing `data.dtypes` with the strings 'int' / 'float' only matches one
# exact dtype, and the width that 'int' resolves to depends on the platform
# and NumPy version, so int64 columns could be silently left out of the count.
object_cols = list(data.select_dtypes(include='object').columns)
print("Categorical variables:", len(object_cols))

num_cols = list(data.select_dtypes(include=np.integer).columns)
print("Integer variables:", len(num_cols))

fl_cols = list(data.select_dtypes(include=np.floating).columns)
print("Float variables:", len(fl_cols))


In [None]:
# Count plot of the `type` column: one bar per transaction type.
sns.countplot(x='type', data=data)


In [None]:
# Bar plot of `amount` for each transaction `type`.
sns.barplot(x='type', y='amount', data=data)


In [None]:
# Count how many transactions fall into each `isFraud` class.
data['isFraud'].value_counts()


In [None]:
# Build a class-balanced sample: draw the same number of rows from the
# isFraud == 0 transactions and from the isFraud == 1 transactions.
kind1_data = data[data['isFraud'] == 0]
kind2_data = data[data['isFraud'] == 1]

# Requested number of rows per class; adjust as needed.
sample_size = 8000
# DataFrame.sample (without replacement) raises ValueError when n exceeds the
# rows available, so cap the draw at the size of the smaller class. Using the
# same n for both classes keeps the result balanced.
n_per_class = min(sample_size, len(kind1_data), len(kind2_data))
sampled_kind1 = kind1_data.sample(n=n_per_class, random_state=42)
sampled_kind2 = kind2_data.sample(n=n_per_class, random_state=42)

# Stack the two samples into a single frame with a fresh 0..N-1 index.
# NOTE: this rebinds `data`; every later cell works on the balanced sample.
data = pd.concat([sampled_kind1, sampled_kind2], ignore_index=True)

In [None]:
# Re-check the `isFraud` class counts now that `data` holds the sampled rows.
data['isFraud'].value_counts()

In [None]:
# Density histogram of the `step` column (50 bins) with a KDE curve on top.
fig, ax = plt.subplots(figsize=(15, 6))
sns.histplot(
    data['step'],
    bins=50,
    stat="density",
    kde=True,
    kde_kws={'cut': 3},
    ax=ax,
)


In [None]:
# Restrict the frame to its int64 / float64 columns so that non-numeric
# columns stay out of the correlation computation.
data_numeric = data.select_dtypes(include=['int64', 'float64'])
numeric_columns = data_numeric.columns

# Pairwise correlation between the numeric columns.
corr_matrix = data_numeric.corr()

# Annotated heatmap of the correlation matrix (values shown to 2 decimals).
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='BrBG',
    linewidths=2,
    ax=ax,
)

plt.show()

In [None]:
# One-hot encode the transaction `type`; drop_first=True leaves out the
# indicator column of the first category.
type_new = pd.get_dummies(
    data['type'],
    drop_first=True,
)
# Append the indicator columns to the right of the existing columns.
data_new = pd.concat(objs=[data, type_new], axis='columns')
data_new.head()


In [None]:
# `isFraud` is the target. It is removed from the features together with the
# raw `type` column (already one-hot encoded) and the two name columns.
excluded_columns = ['isFraud', 'type', 'nameOrig', 'nameDest']
y = data_new['isFraud']
X = data_new.drop(columns=excluded_columns)
X.shape, y.shape


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
# Fitting different models

def get_model_name(model):
    """Return the label printed for ``model`` in the comparison output.

    XGBClassifier instances get the fixed label ``'XGBClassifier()'``;
    every other object is labelled with its ``repr``.
    """
    return 'XGBClassifier()' if isinstance(model, XGBClassifier) else repr(model)

# Candidate classifiers to compare. SVC is built with probability=True so
# that, like the others, it can be scored through predict_proba below.
models = [LogisticRegression(), XGBClassifier(),
          SVC(kernel='rbf', probability=True),
          RandomForestClassifier(n_estimators=7,
                                 criterion='entropy',
                                 random_state=7)]

for model in models:
    model.fit(X_train, y_train)
    model_name = get_model_name(model)

    print(f'{model_name} :')

    # `ras` is sklearn's roc_auc_score. It is fed column 1 of predict_proba
    # (the positive-class probability), so the reported number is a ROC AUC,
    # not an accuracy -- the labels below say so explicitly.
    train_preds = model.predict_proba(X_train)[:, 1]
    print('Training ROC AUC:', ras(y_train, train_preds))

    y_preds = model.predict_proba(X_test)[:, 1]
    print('Validation ROC AUC:', ras(y_test, y_preds))
    print()


In [None]:
# Fit a standalone XGBClassifier (the author reports it as the best-scoring
# model of the comparison) and put its test-set predictions next to the
# true labels.
classifier = XGBClassifier()
classifier.fit(X_train, y_train)
y_predict = classifier.predict(X_test)

Results = pd.DataFrame({'Actual': y_test}).assign(Predicted=y_predict)
Results.head(10)

In [None]:
# Confusion matrix of the XGBClassifier fitted in the comparison loop
# (it is the second entry of `models`).
from sklearn.metrics import confusion_matrix

model = models[1]
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated Seaborn heatmap with integer cell labels.
fig, ax = plt.subplots(figsize=(5, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='BuPu', cbar=True, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()
