In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
# Read the raw CSV into the working frame used by every later cell.
# NOTE(review): absolute '/content/...' path — presumably a Colab upload
# location; adjust when running elsewhere.
csv_path = '/content/AIML Dataset.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# List the column labels available in the frame.
data.columns

In [None]:
# Print the per-column dtype / non-null summary of the raw frame.
data.info()

In [None]:

# Show the dtype of the target column before it is cast.
print(data.dtypes['isFraud'])

In [None]:
# Cast both label columns to pandas' nullable integer dtype ('Int64').
for label_col in ('isFraud', 'isFlaggedFraud'):
    data[label_col] = data[label_col].astype('Int64')

In [None]:
# Re-check the dtype summary now that the label columns were cast to Int64.
data.info()

In [None]:
# Row count per distinct `isFraud` value (class balance of the target).
data['isFraud'].value_counts()

In [None]:
# Row count per distinct `isFlaggedFraud` value.
data['isFlaggedFraud'].value_counts()

In [None]:
# Missing-value count per column, before any imputation.
data.isna().sum()

In [None]:
# Replace missing values in the three affected columns with 0.
#
# The filled Series is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on `data[col]`: that chained in-place form
# raises a FutureWarning on pandas 2.x (hidden here by the global warnings
# filter) and stops updating `data` at all once Copy-on-Write is active.
#
# NOTE(review): filling missing `isFraud` labels with 0 treats unlabeled rows
# as non-fraud — confirm this is intended rather than dropping those rows.
for col in ('newbalanceDest', 'isFraud', 'isFlaggedFraud'):
    data[col] = data[col].fillna(0)

In [None]:
# Missing-value count per column, re-checked after the fill step.
data.isna().sum()

In [None]:
# (rows, columns) of the frame.
data.shape

In [None]:
# Percentage of rows whose `isFraud` label is 1, rounded to two decimals.
fraud_count = data['isFraud'].value_counts()[1]
row_count = data.shape[0]
round(fraud_count / row_count * 100, 2)

In [None]:
# Total number of rows in the frame.
len(data)

In [None]:
# Class counts of the target again, now that missing labels were filled with 0.
data['isFraud'].value_counts()

In [None]:
# Manual percentage from two hard-coded literals.
# NOTE(review): 107 and 69858 look like values copied from the outputs of the
# cells above (fraud count / row count) — confirm, and prefer computing the
# ratio from `data` so it cannot go stale when the dataset changes.
107 / 69858 * 100

In [None]:
# Bar chart of how many rows fall under each transaction type.
type_counts = data['type'].value_counts().sort_values(ascending=False)
ax = type_counts.plot(kind='bar', title='Transaction Type', color='skyblue')
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Mean of `isFraud` per transaction type (i.e. the fraud rate), highest first.
fraud_by_type = (
    data.groupby('type')['isFraud']
    .mean()
    .sort_values(ascending=False)
)
ax = fraud_by_type.plot(kind='bar', title='Fraud rate by type', color='salmon')
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Fraud rate')
plt.show()

In [None]:
# Print the per-type fraud rates behind the bar chart above.
print(fraud_by_type)

In [None]:
# Summary statistics of `amount`, truncated to integers for readability.
data['amount'].describe().astype(int)

In [None]:
# Histogram (with KDE) of log(1 + amount).
log_amount = np.log1p(data['amount'])
ax = sns.histplot(log_amount, bins=100, kde=True, color='red')
ax.set_title('Transaction Amount Distribution (log scale)')
ax.set_xlabel('Transaction Amount (log scale)')
ax.set_ylabel('count')
plt.show()

In [None]:

# Side-by-side comparison of the amount distribution before and after the
# log1p transform.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: raw amounts.
sns.histplot(data['amount'], bins=100, kde=True, color='red', ax=ax_raw)
ax_raw.set_title('Raw Amount Distribution')
ax_raw.set_xlabel('Amount')

# Right panel: log(1 + amount).
sns.histplot(np.log1p(data['amount']), bins=100, kde=True, color='green', ax=ax_log)
ax_log.set_title('Log-Transformed Amount Distribution')
ax_log.set_xlabel('log(1 + Amount)')

fig.tight_layout()
plt.show()


In [None]:
# Box plot of `amount` per `isFraud` class, restricted to rows with
# amount < 50,000.
# The frame is passed via the `data=` keyword: seaborn releases before 0.12 do
# not accept the frame as the first positional argument, so the keyword form
# works across versions.
small_amounts = data[data['amount'] < 50000]
sns.boxplot(data=small_amounts, x='isFraud', y='amount')
plt.title('Amount vs isFraud filtered under 50,000')  # fixed typo "fileterd"
plt.show()

In [None]:
# Derived features: old balance minus new balance, for the origin and the
# destination account respectively.
data['balanceDiffOrig'] = data['oldbalanceOrg'].sub(data['newbalanceOrig'])
data['balanceDiffDest'] = data['oldbalanceDest'].sub(data['newbalanceDest'])

In [None]:
# Boolean Series: True where the origin balance difference is negative.
# NOTE(review): this displays the whole per-row mask; the next cell's `.sum()`
# is the compact summary.
data['balanceDiffOrig'] < 0

In [None]:
# Number of rows whose origin balance difference is negative.
data['balanceDiffOrig'].lt(0).sum()

In [None]:
# Number of rows whose destination balance difference is negative.
data['balanceDiffDest'].lt(0).sum()

In [None]:
# Ten most frequent origin and destination account names.
sender_counts = data['nameOrig'].value_counts()
receiver_counts = data['nameDest'].value_counts()
top_senders = sender_counts.iloc[:10]
top_receviers = receiver_counts.iloc[:10]

In [None]:
# Ten origin accounts that appear most often in fraudulent rows.
# The original called `value_counts()` on the whole filtered DataFrame, which
# counts identical full rows (nearly all unique) instead of users; counting
# the `nameOrig` column gives the per-account tally the variable name implies.
fraud_tx = data[data['isFraud'] == 1]
fraud_users = fraud_tx['nameOrig'].value_counts().head(10)
fraud_users

In [None]:
# Keep only TRANSFER and CASH_OUT rows, then preview the subset.
is_transfer_or_cashout = data['type'].isin(['TRANSFER', 'CASH_OUT'])
fraud_type = data[is_transfer_or_cashout]
fraud_type.head()

In [None]:
# Row count per transaction type within the TRANSFER / CASH_OUT subset.
fraud_type['type'].value_counts()

In [None]:
# Count plot of transaction type within the TRANSFER / CASH_OUT subset.
# The column is passed explicitly via `data=` / `x=`: a bare positional Series
# is interpreted as `data` (not `x`) by newer seaborn releases, so the plot
# would vary or fail depending on the installed version.
# NOTE(review): `fraud_type` holds every TRANSFER / CASH_OUT row, not only the
# fraudulent ones, so the bars are total counts per type.
plt.figure(figsize=(8, 8))
sns.countplot(data=fraud_type, x='type')
plt.title('Fraud Distribution in Transfer & Cash_out')
plt.show()

In [None]:
# Column labels again, now including the derived balanceDiff* features.
data.columns

In [None]:
# Pairwise correlation between the amount / balance columns and the target.
corr_columns = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
    'isFraud',
]
corr = data[corr_columns].corr()
corr

In [None]:
# Annotated heatmap of the correlation matrix computed above.
ax = sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
ax.set_title('correlation matrix')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Preview the frame before the modelling-specific column drop.
data.head()

In [None]:
# Remove the `step` column before modelling.
# `DataFrame.drop(..., inplace=True)` returns None, so the original assignment
# left `df_model` bound to None, and re-running the cell raised KeyError
# because `step` was already gone. Dropping out-of-place with errors='ignore'
# makes the cell re-runnable; `data` is rebound so the later cells that read
# `data` still see the frame without `step`.
data = data.drop(columns=['step'], errors='ignore')
df_model = data

In [None]:
# Preview again to confirm `step` is no longer among the columns.
data.head()

In [None]:
# Feature columns fed to the model, grouped by the preprocessing they need.
categorical = [
    'type',
]
numerical = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]


In [None]:
# Separate the feature frame from the target column.
X = data.drop(columns=['isFraud'])
Y = data['isFraud']

In [None]:
# 70/30 train/test split, stratified on the target so both parts keep the
# same fraud ratio.
# `random_state` is fixed so the split — and every metric reported below —
# is reproducible across kernel restarts; the original split was reshuffled
# on each run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=42
)


In [None]:
# Column-wise preprocessing: standardise the numeric features, one-hot encode
# the categorical ones (first level dropped), and discard every other column.
numeric_step = ('num', StandardScaler(), numerical)
category_step = ('cat', OneHotEncoder(drop='first'), categorical)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, category_step],
    remainder='drop',
)

Creating the model pipeline

In [None]:
# Chain the preprocessing with a class-weighted logistic regression.
classifier = LogisticRegression(class_weight='balanced', max_iter=1000)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', classifier),
])

In [None]:
# Fit the preprocessing + classifier pipeline on the training split.
pipeline.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred = pipeline.predict(x_test)

In [None]:
# Text report comparing the test labels with the predictions.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
confusion_matrix(y_test, y_pred)

The accuracy computed below is about 81%.

In [None]:
# Pipeline score on the test split, expressed as a percentage.
# NOTE(review): the fraud class is a tiny share of the rows, so this single
# figure says little on its own — read it with the classification report and
# confusion matrix above.
pipeline.score(x_test, y_test) * 100

Running the joblib cell below creates a new file with the specified name.

In [None]:
import joblib

# Persist the fitted pipeline to disk in the current working directory.
joblib.dump(value=pipeline, filename='fraud_detection_pipeline.pkl')