# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix ,roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.utils import resample
import datetime

# Read the transactions file, then split it into a label vector (the `Class`
# column) and a feature matrix holding every other column, both as NumPy arrays.
data = pd.read_csv('creditcard.csv')
y = data['Class'].to_numpy()
X = data.drop(columns='Class').to_numpy()


# Light grid theme for the figures drawn from here on
sns.set_style("whitegrid")

# Two equally tall, stacked panels that share the time axis:
# fraud (Class == 1) on top, normal (Class == 0) below.
bins = 100
fig, (ax1, ax2) = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=True,
    figsize=(12, 6),
    gridspec_kw={'height_ratios': [1, 1]},
)

# Top panel: when the fraudulent transactions happened
fraud_times = data.loc[data['Class'] == 1, 'Time']
ax1.hist(fraud_times, bins=bins, color='red', alpha=0.7)
ax1.set_title('Fraud', fontsize=14)
ax1.set_ylabel('Number of Transactions', fontsize=12)
ax1.grid(True, linestyle='--', alpha=0.5)

# One y tick every 5 transactions, up to the panel's current upper limit
fraud_y_max = ax1.get_ylim()[1]
ax1.set_yticks(np.arange(0, fraud_y_max, 5))

# Bottom panel: when the normal transactions happened
normal_times = data.loc[data['Class'] == 0, 'Time']
ax2.hist(normal_times, bins=bins, color='blue', alpha=0.7)
ax2.set_title('Normal', fontsize=14)
ax2.set_xlabel('Time (in Seconds)', fontsize=12)
ax2.set_ylabel('Number of Transactions', fontsize=12)
ax2.grid(True, linestyle='--', alpha=0.5)

# Hide the top and right frame lines on both panels
for ax in (ax1, ax2):
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

# Let matplotlib re-pack the panels so the axis labels are not clipped
plt.tight_layout()
plt.show()


# Side-by-side box plots of the transaction amount for each class,
# drawn on a logarithmic y axis.
amounts_by_class = [
    data.loc[data["Class"] == 1, "Amount"],
    data.loc[data["Class"] == 0, "Amount"],
]

plt.figure(figsize=(10, 6))
plt.boxplot(amounts_by_class, labels=['Fraud', 'Normal'])
plt.yscale('log')
plt.ylabel('Amount')
plt.title('Transaction Amounts: Fraud vs Normal')
plt.show()

# Dataset exploring
print(data.columns)

# Replace `data` with a reproducible 10% random sample; every statement that
# follows works on this subset rather than on the full file.
data = data.sample(frac=0.1, random_state=1)
print(data.shape)
print(data.describe())

# V1 - V28 are the results of PCA Dimensionality reduction to protect user identities and sensitive features

# One histogram per column of the sampled frame
data.hist(figsize=(20, 20))
plt.show()

# Split the sample by label and report how the two classes compare in size
Fraud = data.loc[data['Class'] == 1]
Valid = data.loc[data['Class'] == 0]

# Ratio of fraud rows to valid rows (not the share of the whole sample)
outlier_fraction = len(Fraud) / len(Valid)
print(outlier_fraction)

print('Fraud Cases: {}'.format(len(Fraud)))
print('Valid Transactions: {}'.format(len(Valid)))

# Heat map of the pairwise column correlations, colour scale capped at 0.8
corrmat = data.corr()
fig = plt.figure(figsize=(12, 9))
sns.heatmap(corrmat, square=True, vmax=.8)
plt.show()

# Store the variable we'll be predicting on
target = "Class"

# Every column of the DataFrame except the label is used as a feature
columns = [c for c in data.columns.tolist() if c != target]

X = data[columns]
y = data[target]

# Split the data into training and testing sets.
# The split is stratified on the label so that the (rare) positive class keeps
# the same proportion in both parts; without it a random 80/20 split can leave
# the test set with very few positive rows, which makes precision / recall /
# AUC unstable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Re-attach the labels so rows can be resampled together with their label.
# Only the training part is oversampled; the test part is left untouched.
train_data = pd.concat([X_train, y_train], axis=1)

# Separate minority and majority classes
minority_class = train_data[train_data[target] == 1]
majority_class = train_data[train_data[target] == 0]

# Upsample the minority class by random oversampling: rows are drawn WITH
# replacement until there are as many as in the majority class. (This is plain
# duplication of existing rows, not SMOTE - no synthetic samples are created.)
minority_upsampled = resample(
    minority_class,
    replace=True,
    n_samples=len(majority_class),
    random_state=42,
)

# Combine oversampled minority class with majority class
upsampled_train_data = pd.concat([majority_class, minority_upsampled], axis=0)

# Shuffle the rows so the two classes are interleaved
upsampled_train_data = upsampled_train_data.sample(frac=1, random_state=42)

# Split the oversampled data back into features and labels
X_train_resampled = upsampled_train_data.drop(target, axis=1)
y_train_resampled = upsampled_train_data[target]

# Define the fitness function for the genetic algorithm
def evaluate_features(features):
    """Return the fitness of a candidate feature subset.

    A small random forest (5 trees, fixed seed) is fitted on the oversampled
    training data restricted to the chosen columns, and its accuracy on the
    test split is returned.

    Args:
        features: Positional column selector passed to ``DataFrame.iloc``
            (for example a list of integer column positions).

    Returns:
        The accuracy as a float, or ``0.0`` when *features* is empty.
    """
    # A forest cannot be fitted on zero columns; give an empty selection the
    # worst possible fitness instead of crashing the search.
    if len(features) == 0:
        return 0.0

    clf = RandomForestClassifier(n_estimators=5, random_state=5)
    clf.fit(X_train_resampled.iloc[:, features], y_train_resampled)

    # NOTE(review): fitness is scored on X_test / y_test, the same split the
    # final models are evaluated on, so feature selection sees the test data.
    # Consider carving a validation split out of the training data instead.
    y_pred = clf.predict(X_test.iloc[:, features])
    return accuracy_score(y_test, y_pred)

# Genetic Algorithm Configuration
population_size = 3
num_generations = 1
num_features = X_train_resampled.shape[1]
mutation_rate = 0.1

# Number of elite individuals that survive unchanged and act as parents.
# At least two, so crossover can combine different individuals (20% of a tiny
# population would round down to zero), and never more than the population.
num_parents = min(population_size, max(2, int(population_size * 0.2)))

# Dedicated seeded generator: makes the search repeatable, like the fixed
# random_state values used elsewhere in this script, without reseeding the
# global `random` module.
ga_rng = random.Random(42)

# Fitness memo keyed by chromosome, so surviving elites are not re-fitted.
fitness_cache = {}


def _mask_to_indices(mask):
    """Return the positional column indices whose gene is switched on."""
    return [i for i, selected in enumerate(mask) if selected]


def _ensure_non_empty(mask):
    """Switch on one random gene if the mask selects no feature at all."""
    if not any(mask):
        mask[ga_rng.randrange(len(mask))] = 1
    return mask


def _fitness(mask):
    """Return the (memoised) fitness of a 0/1 feature mask."""
    key = tuple(mask)
    if key not in fitness_cache:
        fitness_cache[key] = evaluate_features(_mask_to_indices(mask))
    return fitness_cache[key]


# Each individual is a 0/1 mask with one gene per feature column
# (1 = feature is used). Masks are never allowed to be all zeros.
population = [
    _ensure_non_empty([ga_rng.randint(0, 1) for _ in range(num_features)])
    for _ in range(population_size)
]
fitness_scores = [_fitness(individual) for individual in population]

# Genetic Algorithm
for generation in range(num_generations):
    # Select the top-performing individuals (fittest first)
    ranking = np.argsort(fitness_scores)[::-1]
    parents = [population[i] for i in ranking[:num_parents]]

    # Fill the rest of the population by crossing over and mutating parents
    new_population = []
    for _ in range(population_size - num_parents):
        parent1 = ga_rng.choice(parents)
        parent2 = ga_rng.choice(parents)

        # Single-point crossover: head of one parent, tail of the other
        crossover_point = ga_rng.randint(0, num_features - 1)
        child = parent1[:crossover_point] + parent2[crossover_point:]

        # Apply mutation: flip one randomly chosen gene
        if ga_rng.random() < mutation_rate:
            mutated_gene = ga_rng.randint(0, num_features - 1)
            child[mutated_gene] = 1 - child[mutated_gene]

        new_population.append(_ensure_non_empty(child))

    # The elites are carried over unchanged, so the best individual found so
    # far is always still present in the new population.
    population = parents + new_population
    fitness_scores = [_fitness(individual) for individual in population]

# Pick the winner using scores that belong to the population being indexed
best_individual = population[int(np.argmax(fitness_scores))]

# Get the best features selected (positional column indices)
best_features = _mask_to_indices(best_individual)

# Train the RandomForestClassifier on the best individual
clf_rf = RandomForestClassifier(n_estimators=5, random_state=5)
clf_rf.fit(X_train_resampled.iloc[:, best_features], y_train_resampled)

# Get feature importances from the trained RandomForestClassifier model
feature_importances = clf_rf.feature_importances_

# Pair every importance with the column it was actually computed for: the
# model only saw the selected columns, in this order.
feature_importance_df = pd.DataFrame({
    'Feature': X_train_resampled.columns[best_features],
    'Importance': feature_importances,
})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot the feature importances
plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title('Feature Importances')
plt.show()


# Train and test a Decision Tree model using the best features
clf_decision_tree = DecisionTreeClassifier(random_state=42)
clf_decision_tree.fit(X_train_resampled.iloc[:, best_features], y_train_resampled)
y_pred_decision_tree = clf_decision_tree.predict(X_test.iloc[:, best_features])

# Score for the positive class (label 1). ROC / AUC are ranking metrics and
# must be computed from scores, not from the thresholded 0/1 predictions:
# hard labels collapse the ROC curve to a single operating point.
y_score_decision_tree = clf_decision_tree.predict_proba(X_test.iloc[:, best_features])[:, 1]

# Evaluate Decision Tree model
accuracy_decision_tree = accuracy_score(y_test, y_pred_decision_tree)
precision_decision_tree = precision_score(y_test, y_pred_decision_tree)
recall_decision_tree = recall_score(y_test, y_pred_decision_tree)
f1score_decision_tree = f1_score(y_test, y_pred_decision_tree)
AUC_decision_tree = roc_auc_score(y_test, y_score_decision_tree)
confusion_matrix_decision_tree = confusion_matrix(y_test, y_pred_decision_tree)

# ROC curve points and the area under them (used by the ROC figure)
fpr_decision_tree, tpr_decision_tree, thresholds_decision_tree = roc_curve(y_test, y_score_decision_tree)
roc_auc_decision_tree = auc(fpr_decision_tree, tpr_decision_tree)

print("Accuracy of Decision Tree model on Test Data:", accuracy_decision_tree)
print("Precision of Decision Tree model on Test Data:", precision_decision_tree)
print("Recall of Decision Tree model on Test Data:", recall_decision_tree)
print("F1 Score of Decision Tree model on Test Data:", f1score_decision_tree)
print("AUC of Decision Tree model on Test Data:", AUC_decision_tree)
print("Confusion Matrix of Decision Tree model on Test Data:")
print(confusion_matrix_decision_tree)


# Train and test a Logistic Regression model using the best features
# NOTE(review): the features are passed in unscaled and the solver settings
# are the library defaults - confirm the fit converges on this data.
clf_logistic_regression = LogisticRegression(random_state=42)
clf_logistic_regression.fit(X_train_resampled.iloc[:, best_features], y_train_resampled)
y_pred_logistic_regression = clf_logistic_regression.predict(X_test.iloc[:, best_features])

# Score for the positive class (label 1). ROC / AUC are ranking metrics and
# must be computed from scores, not from the thresholded 0/1 predictions:
# hard labels collapse the ROC curve to a single operating point.
y_score_logistic_regression = clf_logistic_regression.predict_proba(X_test.iloc[:, best_features])[:, 1]

# Evaluate Logistic Regression model
accuracy_logistic_regression = accuracy_score(y_test, y_pred_logistic_regression)
precision_logistic_regression = precision_score(y_test, y_pred_logistic_regression)
recall_logistic_regression = recall_score(y_test, y_pred_logistic_regression)
f1_logistic_regression = f1_score(y_test, y_pred_logistic_regression)
auc_logistic_regression = roc_auc_score(y_test, y_score_logistic_regression)
confusion_logistic_regression = confusion_matrix(y_test, y_pred_logistic_regression)

# ROC curve points and the area under them (used by the ROC figure)
fpr_logistic_regression, tpr_logistic_regression, thresholds_logistic_regression = roc_curve(y_test, y_score_logistic_regression)
roc_auc_logistic_regression = auc(fpr_logistic_regression, tpr_logistic_regression)

# The report is labelled with the model it actually describes
print("Accuracy of Logistic Regression model on Test Data:", accuracy_logistic_regression)
print("Precision of Logistic Regression model on Test Data:", precision_logistic_regression)
print("Recall of Logistic Regression model on Test Data:", recall_logistic_regression)
print("F1 Score of Logistic Regression model on Test Data:", f1_logistic_regression)
print("AUC of Logistic Regression model on Test Data:", auc_logistic_regression)
print("Confusion Matrix of Logistic Regression model on Test Data:")
print(confusion_logistic_regression)


# Train and test a Naive Bayes model using the best features
clf_naive_bayes = GaussianNB()
clf_naive_bayes.fit(X_train_resampled.iloc[:, best_features], y_train_resampled)
y_pred_naive_bayes = clf_naive_bayes.predict(X_test.iloc[:, best_features])

# Score for the positive class (label 1). ROC / AUC are ranking metrics and
# must be computed from scores, not from the thresholded 0/1 predictions:
# hard labels collapse the ROC curve to a single operating point.
y_score_naive_bayes = clf_naive_bayes.predict_proba(X_test.iloc[:, best_features])[:, 1]

# Evaluate Naive Bayes model
accuracy_naive_bayes = accuracy_score(y_test, y_pred_naive_bayes)
precision_naive_bayes = precision_score(y_test, y_pred_naive_bayes)
recall_naive_bayes = recall_score(y_test, y_pred_naive_bayes)
f1_naive_bayes = f1_score(y_test, y_pred_naive_bayes)
auc_naive_bayes = roc_auc_score(y_test, y_score_naive_bayes)
confusion_naive_bayes = confusion_matrix(y_test, y_pred_naive_bayes)

# ROC curve points and the area under them (used by the ROC figure)
fpr_naive_bayes, tpr_naive_bayes, thresholds_naive_bayes = roc_curve(y_test, y_score_naive_bayes)
roc_auc_naive_bayes = auc(fpr_naive_bayes, tpr_naive_bayes)

print("Metrics for Naive Bayes Model:")
print("Accuracy:", accuracy_naive_bayes)
print("Precision:", precision_naive_bayes)
print("Recall:", recall_naive_bayes)
print("F1 Score:", f1_naive_bayes)
print("AUC:", auc_naive_bayes)
print("Confusion Matrix:")
print(confusion_naive_bayes)


# Train and test a Random Forest model using the best features
clf_random_forest = RandomForestClassifier(random_state=42)
clf_random_forest.fit(X_train_resampled.iloc[:, best_features], y_train_resampled)
y_pred_random_forest = clf_random_forest.predict(X_test.iloc[:, best_features])

# Score for the positive class (label 1). ROC / AUC are ranking metrics and
# must be computed from scores, not from the thresholded 0/1 predictions:
# hard labels collapse the ROC curve to a single operating point.
y_score_random_forest = clf_random_forest.predict_proba(X_test.iloc[:, best_features])[:, 1]

# Evaluate Random Forest model
accuracy_random_forest = accuracy_score(y_test, y_pred_random_forest)
precision_random_forest = precision_score(y_test, y_pred_random_forest)
recall_random_forest = recall_score(y_test, y_pred_random_forest)
f1_random_forest = f1_score(y_test, y_pred_random_forest)
auc_random_forest = roc_auc_score(y_test, y_score_random_forest)
confusion_random_forest = confusion_matrix(y_test, y_pred_random_forest)

# ROC curve points and the area under them (used by the ROC figure)
fpr_random_forest, tpr_random_forest, thresholds_random_forest = roc_curve(y_test, y_score_random_forest)
roc_auc_random_forest = auc(fpr_random_forest, tpr_random_forest)

print("Metrics for Random Forest Model:")
print("Accuracy:", accuracy_random_forest)
print("Precision:", precision_random_forest)
print("Recall:", recall_random_forest)
print("F1 Score:", f1_random_forest)
print("AUC:", auc_random_forest)
print("Confusion Matrix:")
print(confusion_random_forest)

# Overlay the ROC curve of every model on one square figure.
# Each entry: (false-positive rates, true-positive rates, line colour,
#              legend template, area under the curve)
roc_curves = [
    (fpr_decision_tree, tpr_decision_tree, 'darkorange',
     'Decision Tree (AUC = {:.2f})', roc_auc_decision_tree),
    (fpr_logistic_regression, tpr_logistic_regression, 'darkgreen',
     'Logistic Regression (AUC = {:.2f})', roc_auc_logistic_regression),
    (fpr_naive_bayes, tpr_naive_bayes, 'purple',
     'Naive Bayes (AUC = {:.2f})', roc_auc_naive_bayes),
    (fpr_random_forest, tpr_random_forest, 'blue',
     'Random Forest (AUC = {:.2f})', roc_auc_random_forest),
]

plt.figure(figsize=(8, 8))
for curve_fpr, curve_tpr, curve_color, label_template, curve_auc in roc_curves:
    plt.plot(curve_fpr, curve_tpr, color=curve_color, lw=2,
             label=label_template.format(curve_auc))

# Dashed diagonal: the reference line of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()


import seaborn as sns

# Assuming 'Time' is the feature representing seconds elapsed since the first transaction.
# The numeric column is replaced in place by timestamps counted from the Unix
# epoch, so only the time-of-day part of the result is meaningful.
data['Time'] = pd.to_datetime(data['Time'], unit='s')

# Extract 'Hour' feature (0-23) from the converted timestamps
data['Hour'] = data['Time'].dt.hour

# KDE Plot for Hour: one filled density curve per class.
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(data.loc[data['Class'] == 0, 'Hour'], label='Valid', fill=True, color='green')
sns.kdeplot(data.loc[data['Class'] == 1, 'Hour'], label='Fraud', fill=True, color='red')
plt.title('KDE Plot for Hour of the Day')
plt.xlabel('Hour of the Day')
plt.ylabel('Density')
plt.legend()
plt.show()

