In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV


In [None]:
# Location of the raw card-transaction CSV, kept in one place so it is easy to repoint.
csv_path = "/content/card_transdata.csv"
data = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
data.head()

In [None]:
# Print the frame summary (columns, non-null counts, dtypes).
data.info()

In [None]:
# (rows, columns) of the raw frame, before any cleaning.
data.shape


In [None]:
# Descriptive statistics for the numeric columns.
data.describe()

In [None]:
# Per-column dtypes of the raw frame.
data.dtypes

Cleaning the Dataset (Missing Values and Duplicates)

In [None]:

# Boolean mask: True for every row that has at least one missing value.
missing_values = data.isnull().any(axis=1)
print("Rows with Missing Values:")
# Show the affected rows themselves; printing the mask would only list a
# True/False flag for every row of the dataset, which is not what the label says.
print(data[missing_values])

In [None]:
# Rows that are exact repeats of an earlier row (the first occurrence is not flagged).
duplicate_rows = data[data.duplicated()]
# Label corrected: the original printed the misspelling "Deuplicated Rows:".
print("Duplicated Rows:")
print(duplicate_rows)

Removing Rows with Missing Values

In [None]:
# Discard every row that contains a missing value; rebind instead of mutating in place.
data = data.dropna(axis="index")

Removing Duplicate Rows

In [None]:
# Keep only the first occurrence of each fully identical row.
data = data.drop_duplicates()

Evaluating Security of Chip and Pin Transactions


In [None]:
# Narrow the frame to the chip / PIN indicators plus the fraud label.
chip_pin_columns = ["used_chip", "used_pin_number", "fraud"]
chippindf = data.loc[:, chip_pin_columns]

In [None]:
# Headline counts: all transactions, all fraud cases, and the fraud cases
# that occurred when a chip / a PIN was used.
fraud_flags = chippindf["fraud"]
total_transactions = chippindf.shape[0]
total_fraud = fraud_flags.sum()
fraud_by_chip = fraud_flags[chippindf["used_chip"] == 1].sum()
fraud_by_pin = fraud_flags[chippindf["used_pin_number"] == 1].sum()


In [None]:
# Report the counts computed in the previous cell.
print(f"Total transactions: {total_transactions}")
print(f"Total fraud cases: {total_fraud}")
print(f"Fraud cases using chip: {fraud_by_chip} out of {total_transactions}")
print(f"Fraud cases using pin: {fraud_by_pin} out of {total_transactions}")

In [None]:
# Two pie charts: fraud vs. non-fraud *within* chip transactions and *within*
# PIN transactions.
#
# Fixes relative to the previous version:
#   * each pie is now sized from the transactions of its own group; before, the
#     "Non-Fraud" slice was (all transactions - group fraud), so it also
#     contained transactions outside the group and fraudulent ones;
#   * `plt.show` was referenced without being called.
chip_fraud_flags = chippindf.loc[chippindf["used_chip"] == 1, "fraud"]
pin_fraud_flags = chippindf.loc[chippindf["used_pin_number"] == 1, "fraud"]

labels_chip = ["Non-Fraud", "Fraud"]
chip_fraud_count = chip_fraud_flags.sum()
sizes_chip = [len(chip_fraud_flags) - chip_fraud_count, chip_fraud_count]
colors_chip = ["lightskyblue", "lightcoral"]

labels_pin = ["Non-Fraud", "Fraud"]
pin_fraud_count = pin_fraud_flags.sum()
sizes_pin = [len(pin_fraud_flags) - pin_fraud_count, pin_fraud_count]
colors_pin = ["lightskyblue", "lightcoral"]

fig, (ax_chip, ax_pin) = plt.subplots(1, 2, figsize=(12, 6))

ax_chip.pie(sizes_chip, labels=labels_chip, colors=colors_chip, startangle=140)
ax_chip.axis("equal")  # keep the pie circular
ax_chip.set_title("Chip Transactions")

ax_pin.pie(sizes_pin, labels=labels_pin, colors=colors_pin, startangle=140)
ax_pin.axis("equal")
ax_pin.set_title("Pin Transactions")

fig.suptitle("Fraud cases in Chip and pin transaction")
plt.show()

Finding Correlation Between Transaction Amount & Fraud

In [None]:
# Purchase-price ratio alongside the fraud label, for the correlation analysis below.
correlation_columns = ["ratio_to_median_purchase_price", "fraud"]
correlation_df = data.loc[:, correlation_columns]

In [None]:
# Correlation between the purchase-price ratio and the fraud label.
ratio_series = correlation_df["ratio_to_median_purchase_price"]
fraud_series = correlation_df["fraud"]
correlation = ratio_series.corr(fraud_series)
print("Correlation between transaction amount and fraud:{}".format(correlation))


In [None]:
# Mean purchase-price ratio for legitimate (fraud == 0) and fraudulent (fraud == 1) rows.
ratio_column = "ratio_to_median_purchase_price"
avgnonfraudtransaction = correlation_df.loc[correlation_df["fraud"] == 0, ratio_column].mean()
avgfraudtransaction = correlation_df.loc[correlation_df["fraud"] == 1, ratio_column].mean()
# Labels corrected: the original output misspelled "fraudulent" as "frudelent".
print(f"Average ratio to median purchase price for non-fraudulent transactions: {avgnonfraudtransaction}")
print(f"Average ratio to median purchase price for fraudulent transactions: {avgfraudtransaction}")

In [None]:
# Bar chart of the two class means computed in the previous cell.
categories = ["Non-fraudulent", "Fraudulent"]
average_ratio = [avgnonfraudtransaction, avgfraudtransaction]

fig, ax = plt.subplots()
ax.bar(categories, average_ratio, color=['blue', 'red'])
ax.set_title("Ratio to Median Purchase Price")
ax.set_xlabel("Fraud Category")
ax.set_ylabel("Average ratio to median purchase price")
plt.show()

Analyzing Fraud Cases in Online Transactions


In [None]:
# Online-order indicator alongside the fraud label.
online_columns = ["online_order", "fraud"]
online_order_df = data.loc[:, online_columns]

In [None]:
# Fraud rate among online orders versus offline orders.
is_fraud = online_order_df["fraud"] == 1
is_online = online_order_df["online_order"] == 1
is_offline = online_order_df["online_order"] == 0

# Online side: number of orders, fraud cases among them, and the resulting rate.
total_online_orders = online_order_df["online_order"].sum()
total_online_fraud = (is_fraud & is_online).sum()
fraud_rate_online = total_online_fraud / total_online_orders

# Offline side: everything that is not an online order.
total_offline_orders = len(online_order_df) - total_online_orders
total_offline_fraud = (is_fraud & is_offline).sum()
fraud_rate_offline = total_offline_fraud / total_offline_orders

print(
    f"Fraud rate for online transactions: {fraud_rate_online:.2%} "
    f"({total_online_fraud} cases out of {total_online_orders} online transactions)"
)
print(
    f"Fraud rate for offline transactions: {fraud_rate_offline:.2%} "
    f"({total_offline_fraud} cases out of {total_offline_orders} offline transactions)"
)

 Feature Selection with Random Forest

In [None]:
# Feature matrix = every column except the label; target = the fraud label.
X = data.drop(columns=["fraud"])
y = data.loc[:, "fraud"]

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Remove training rows whose label is missing from both the features and the target.
nan_indices = y_train.index[y_train.isna().to_numpy()]
X_train = X_train.drop(index=nan_indices)
y_train = y_train.drop(index=nan_indices)

# Fit the forest and rank the features by the importance it assigns to each.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

importance_by_feature = pd.Series(rf_classifier.feature_importances_, index=X.columns)
feature_importances = importance_by_feature.sort_values(ascending=False)
print("Ranked Feature Importance:")
print(feature_importances)

 Building Credit Card Fraud Detection Model with Random Forest

In [None]:
# Classify one transaction drawn from the dataset with the fitted random forest.
# The draw is seeded so the same row (and therefore the same printed prediction)
# is obtained on every run; previously the sample changed on each execution.
# NOTE(review): the row is drawn from `data`, which also contains the training
# rows, so this demonstrates the API rather than out-of-sample behaviour.
new_transaction_features = data.sample(1, random_state=42).drop('fraud', axis=1)
print("\nRandomly sampled features for new transaction:")
print(new_transaction_features)
prediction = rf_classifier.predict(new_transaction_features)
print("\nPrediction for new transaction:")
print("Fraud" if prediction[0] == 1 else "Legitimate")

In [None]:
# A single hand-written transaction, expressed as one record with the same
# feature columns the forest was trained on.
manual_transaction = {
    'distance_from_home': 7,
    'distance_from_last_transaction': 3,
    'ratio_to_median_purchase_price': 0.1,
    'repeat_retailer': 0,
    'used_chip': 1,
    'used_pin_number': 0,
    'online_order': 0,
}
new_transaction_features1 = pd.DataFrame([manual_transaction])

prediction = rf_classifier.predict(new_transaction_features1)
verdict = "Fraud" if prediction[0] == 1 else "Legitimate"
print("Prediction for new transaction: " + verdict)

 Building Credit Card Fraud Detection Model with Logistic Regression

In [None]:
# Standardise the features using statistics learned from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Apply the *training* mean/variance to the test split. Calling fit_transform
# here (as before) refits the scaler on the test data, so the test set - and
# the new transaction scaled below - would be on a different scale from the
# one the model was trained on.
X_test_scaled = scaler.transform(X_test)

# max_iter is raised from the default to give the solver room to converge.
logreg_classifier = LogisticRegression(max_iter=1000, random_state=42)
logreg_classifier.fit(X_train_scaled, y_train)

# Hand-written transaction to classify; columns match the training features.
new_transaction_features1 = pd.DataFrame({
    'distance_from_home': [89],
    'distance_from_last_transaction': [15],
    'ratio_to_median_purchase_price': [2.3],
    'repeat_retailer': [1],
    'used_chip': [0],
    'used_pin_number': [1],
    'online_order': [1]
})
# The new row must go through the same fitted scaler before prediction.
prediction = logreg_classifier.predict(scaler.transform(new_transaction_features1))
print("Prediction for new transaction: " + ("Fraud" if prediction[0] == 1 else "Legitimate"))

Building Credit Card Fraud Detection Model with Support Vector Machine

In [None]:
# Standardise the full feature matrix for the SVM.
# NOTE: this rebinds `scaler`, replacing the one fitted for logistic regression.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Remove rows where y is NaN.
# X_scaled is a plain NumPy array, so rows must be selected by *position*.
# The previous np.delete(X_scaled, <index labels>) treated DataFrame index
# labels as positions, which picks the wrong rows once earlier cleaning
# (dropna / drop_duplicates) has left gaps in the index. A boolean mask built
# from X's own index is positional by construction.
# Assumes X and y share the (unique) index inherited from `data`.
labelled_index = y.dropna().index
keep_rows = X.index.isin(labelled_index)
X_scaled = X_scaled[keep_rows]
y = y.loc[labelled_index]

In [None]:
# Train a linear SVM and calibrate its scores into probabilities.
#
# A prefit calibrator must be fitted on data the base model has not seen;
# calibrating on the training rows themselves (as before) yields biased
# probabilities. The scaled data is therefore split into a part used to
# fit the SVM and a disjoint part used only for calibration.
X_svm_fit, X_svm_calib, y_svm_fit, y_svm_calib = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

svm_classifier = SVC(kernel="linear", probability=False, random_state=42)
svm_classifier.fit(X_svm_fit, y_svm_fit)

calibrated_svm = CalibratedClassifierCV(svm_classifier, cv="prefit")
calibrated_svm.fit(X_svm_calib, y_svm_calib)

# Read the transaction to score from the user.
# NOTE(review): input() blocks "Run All"; consider replacing it with constants.
distance_from_home = float(input("Enter Distance From Home: "))
distance_from_last_transaction = float(input("Enter Distance From Last Transaction: "))
ratio_to_median_purchase_price = float(input("Enter Ratio to Median Purchase Price: "))
repeat_retailer = int(input("Enter Repeat Retailer (0 or 1): "))
used_chip = int(input("Enter Used Chip (0 or 1): "))
used_pin_number = int(input("Enter Used Pin Number (0 or 1): "))
online_order = int(input("Enter Online Order (0 or 1): "))

# One-row frame with the same feature columns the scaler was fitted on.
new_transaction_features = pd.DataFrame({
    'distance_from_home': [distance_from_home],
    'distance_from_last_transaction': [distance_from_last_transaction],
    'ratio_to_median_purchase_price': [ratio_to_median_purchase_price],
    'repeat_retailer': [repeat_retailer],
    'used_chip': [used_chip],
    'used_pin_number': [used_pin_number],
    'online_order': [online_order]
})

# Scale with the fitted scaler, then take the class label and P(fraud).
scaled_transaction = scaler.transform(new_transaction_features)
prediction = calibrated_svm.predict(scaled_transaction)
# Column 1 of predict_proba is the probability of the positive (fraud == 1) class.
probability_of_fraud = calibrated_svm.predict_proba(scaled_transaction)[:,1][0]

print("Prediction for new transaction: " + ("Fraud" if prediction[0] == 1 else "Legitimate"))
print(f"Probability of fraud: {probability_of_fraud:.4f}")



Evaluating Model Performance with Precision, Recall, and F1 Score

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Score the logistic-regression model on the held-out, scaled test split.
y_pred = logreg_classifier.predict(X_test_scaled)
precision, recall, f1, accuracy = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score, accuracy_score)
)

print("\nEvaluation Metrics:")
metric_report = (
    ("Precision", precision),
    ("Recall", recall),
    ("f1 score", f1),
    ("accuracy", accuracy),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value:.4f}")
