<a href="https://colab.research.google.com/github/SHARKZTECH/INTRO-TO-ML/blob/main/Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [None]:
from google.colab import drive

# Make Google Drive available on the Colab filesystem; the dataset is read from it later.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Step 1: Importing the Dataset
# Read the credit-card CSV from the mounted Google Drive folder into a DataFrame.
dataset_path = "/content/drive/MyDrive/Colab Notebooks/creditcard.csv"
data = pd.read_csv(dataset_path)


In [None]:
# Step 3: Data Preprocessing
# Flag every column that contains at least one missing value and report how many there are.
has_missing = data.isnull().any()
empty_columns = list(data.columns[has_missing])
print("Number of empty columns:", len(empty_columns))

# Keep only the columns that are completely free of missing values.
data = data.dropna(axis="columns", how="any")

Number of empty columns: 0


In [None]:


# Separate the feature columns (X) from the label column (y).
X = data.drop("Class", axis=1)
y = data["Class"]

# Split 70% / 15% / 15% into training, validation, and testing sets.
# stratify keeps the class proportions of y the same in every split, so the
# rare positive class is not over- or under-represented in a split by chance.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=42, stratify=y_val_test
)

# Standardize the features. The scaler is fitted on the training split only and
# then reused for validation/test, so held-out data never influences the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [None]:


# Steps 4-5: Model Selection and Training
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Step 6: Model Evaluation
# Per-class precision / recall / F1 on the validation set.
y_val_pred = model.predict(X_val)
val_report = classification_report(y_val, y_val_pred)
print(val_report)

# Step 7: Testing
# The same report on the held-out testing set.
y_test_pred = model.predict(X_test)
test_report = classification_report(y_test, y_test_pred)
print(test_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     42644
           1       0.86      0.57      0.69        77

    accuracy                           1.00     42721
   macro avg       0.93      0.79      0.84     42721
weighted avg       1.00      1.00      1.00     42721

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     42663
           1       0.89      0.71      0.79        59

    accuracy                           1.00     42722
   macro avg       0.95      0.86      0.90     42722
weighted avg       1.00      1.00      1.00     42722



# Testing different algorithms
In this code, three different algorithms are selected and trained: Logistic Regression, Random Forest, and Support Vector Machines (SVC). Each model is trained and evaluated using the validation set. The evaluation metrics include accuracy, precision, recall, and F1-score.

After all the models have been evaluated, the code selects the best one based on validation performance: the model with the highest validation F1-score is kept. Finally, the best model is evaluated on the testing set, and the evaluation metrics are printed.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 1: Importing the Dataset
from google.colab import drive

# Mount Google Drive, then read the CSV stored in it into a DataFrame.
drive.mount("/content/drive")

csv_path = "/content/drive/MyDrive/Colab Notebooks/creditcard.csv"
data = pd.read_csv(csv_path)

# Step 3: Data Preprocessing
# No missing-value handling is done in this cell; the CSV is used as loaded.

# Separate the feature columns (X) from the label column (y).
X = data.drop("Class", axis=1)
y = data["Class"]

# Split 70% / 15% / 15% into training, validation, and testing sets.
# stratify keeps the class proportions of y the same in every split, so the
# rare positive class is not over- or under-represented in a split by chance.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=42, stratify=y_val_test
)

# Standardize the features. The scaler is fitted on the training split only and
# then reused for validation/test, so held-out data never influences the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Step 4: Model Selection and Training
# random_state pins the random forest so repeated runs build the same model;
# 42 is the seed already used for the train/validation/test split.
models = [
    RandomForestClassifier(random_state=42),
    LogisticRegression(),
    SVC()
]

# Step 7: Selecting the Best Model
# The best validation F1 is tracked inside the training loop, so every model
# predicts on the validation set exactly once.
best_model = None
best_f1 = 0.0

for model in models:
    model.fit(X_train, y_train)

    # Step 6: Model Evaluation
    # Evaluate the model on the validation set
    y_val_pred = model.predict(X_val)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)

    # Print the evaluation metrics
    print("Model:", model.__class__.__name__)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print()

    # `best_model is None` guarantees that a model is selected even when every
    # F1-score is 0.0; the strict `>` keeps the earliest model on ties.
    if best_model is None or f1 > best_f1:
        best_f1 = f1
        best_model = model

# Step 8: Testing
# Score the selected model once on the held-out testing set.
y_test_pred = best_model.predict(X_test)

# Compute the four metrics in a fixed order: accuracy, precision, recall, F1.
accuracy, precision, recall, f1 = [
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
]

# Report the testing-set results.
print("Best Model (Testing Set):", type(best_model).__name__)
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(label, value)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model: RandomForestClassifier
Accuracy: 0.9995084384728822
Precision: 0.9375
Recall: 0.7792207792207793
F1-score: 0.851063829787234

Model: LogisticRegression
Accuracy: 0.9990636923292994
Precision: 0.8627450980392157
Recall: 0.5714285714285714
F1-score: 0.6875

Model: SVC
Accuracy: 0.9992041384799045
Precision: 0.9387755102040817
Recall: 0.5974025974025974
F1-score: 0.73015873015873

Best Model (Testing Set): RandomForestClassifier
Accuracy: 0.9997425214175366
Precision: 0.9444444444444444
Recall: 0.864406779661017
F1-score: 0.9026548672566371


In [None]:
import pickle

# Persist `best_model`, the model that achieved the highest validation F1-score.
# The file is written to the mounted Google Drive folder because that is the
# exact path the loading cell reads the model back from.
# NOTE: the model was fitted on standardized features; the fitted `scaler` is
# not stored in this file.
filename = "/content/drive/MyDrive/Colab Notebooks/fraud_detection_model.pkl"
with open(filename, 'wb') as file:
    pickle.dump(best_model, file)


In [None]:
# Restore the pickled model from the Google Drive folder.
# NOTE: unpickling can execute arbitrary code, so only load files you created yourself.
model_path = "/content/drive/MyDrive/Colab Notebooks/fraud_detection_model.pkl"
with open(model_path, "rb") as model_file:
    loaded_model = pickle.load(model_file)


In [None]:
import numpy as np

# Requires `X` and `scaler` from the training cell and `loaded_model` from the
# loading cell.
threshold = 0.5  # Adjust the threshold based on your requirements

# Sample transaction features.
# NOTE(review): assumed to be raw (unscaled) values - confirm.
sample_transaction = np.array([0.5, -0.3, 1.2, 0.8, -0.5, 0.2, -0.1, 0.3, 0.2, 0.4, -0.5, 0.7, -0.2, 0.1, 0.2, -0.4, 0.6, 0.2, 0.3, -0.1, 100.0])

# Pad with zeros so the sample has one value per feature column of X
# (instead of a hard-coded feature count).
# NOTE(review): the supplied values fill the first columns of X and the rest are
# zero-filled - confirm this matches the intended feature order.
missing_features = X.shape[1] - len(sample_transaction)
dummy_values = np.zeros(missing_features)
sample_transaction = np.concatenate((sample_transaction, dummy_values))

# Build a single-row frame labelled with the training columns, then apply the
# scaler fitted on the training split. The model was trained on standardized
# features, so a sample must go through the same transformation before prediction.
sample_frame = pd.DataFrame(sample_transaction.reshape(1, -1), columns=X.columns)
sample_transaction = scaler.transform(sample_frame)

# Make predictions using the loaded model
prediction_probabilities = loaded_model.predict_proba(sample_transaction)

# Column 1 of the first row is compared with the threshold
# (presumably the probability of class 1, i.e. fraud - confirm against loaded_model.classes_).
if prediction_probabilities[0][1] >= threshold:
    # Transaction classified as potentially fraudulent
    print("Potential Fraudulent Transaction")
else:
    # Transaction classified as legitimate
    print("Legitimate Transaction")


Legitimate Transaction
