In [1]:
#1. Data Collection and Preprocessing
"""
1. Data Collection and Preprocessing
Start by loading the dataset and preprocessing it. For demonstration purposes, 
let's assume you're using a dataset like the one from Kaggle's Credit Card Fraud Detection dataset.
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
# Load the data
    #data = load_data(r"C:\Users\ADMIN\Data Analytics Projects\Student Perfomance Prediction Project\student_data.csv")

data = pd.read_csv(r"C:\Users\ADMIN\Data Analytics Projects\Credit Card Fraud Detection\creditcard.csv")#('creditcard.csv')

# Separate features and labels
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

"""2. Exploratory Data Analysis (EDA)
Performing EDA to understand the data distribution, correlations, and identifying any anomalies."""
import seaborn as sns
import matplotlib.pyplot as plt

# Plot class distribution
sns.countplot(y)
plt.title('Class Distribution')
plt.show()

# Plotting some features
sns.pairplot(data[['V1', 'V2', 'V3', 'Class']], hue='Class')
plt.show() 


"""3. Feature Engineering
Enhancing features to improve model performance."""

# Example of creating new features or modifying existing ones
data['V1_V2_ratio'] = data['V1'] / (data['V2'] + 1e-6)  # Avoid division by zero
# You can add more such features based on domain knowledge

"""
4. Model Development
Using Logistic Regression as the base model.
"""

from sklearn.linear_model import LogisticRegression

# Initialize and train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


"""
5. Model Evaluation and Tuning
Evaluating the model's performance and tuning hyperparameters.
"""

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)


"""6. Addressing Class Imbalance
Using under-sampling and ensemble techniques."""
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import BalancedBaggingClassifier

# Under-sampling
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)

# Ensemble technique
bbc = BalancedBaggingClassifier(base_estimator=LogisticRegression(max_iter=1000),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=42)
bbc.fit(X_res, y_res)

# Evaluate ensemble model
y_pred_ensemble = bbc.predict(X_test)
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
conf_matrix_ensemble = confusion_matrix(y_test, y_pred_ensemble)
class_report_ensemble = classification_report(y_test, y_pred_ensemble)

print(f'Ensemble Model Accuracy: {accuracy_ensemble}')
print('Ensemble Model Confusion Matrix:')
print(conf_matrix_ensemble)
print('Ensemble Model Classification Report:')
print(class_report_ensemble)


"""7. Optimization and Deployment
Further optimizing model efficiency and preparing for deployment."""

from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning
param_grid = {
    'base_estimator__C': [0.01, 0.1, 1, 10, 100],
    'n_estimators': [10, 50, 100]
}

grid_search = GridSearchCV(estimator=bbc, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_res, y_res)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Score: {best_score}')



KeyError: "['Class'] not found in axis"

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('prev_creditcard.csv')

# Separate features and labels
X = data.drop('is_fraud', axis=1)
y = data['is_fraud']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Categorical features
categorical_features = ['gender', 'transaction_location', 'transaction_type', 'card_present', 'repeat_customer', 'merchant_category', 'previous_fraud', 'device_type', 'internet_access']

# Numeric features
numeric_features = ['transaction_time', 'transaction_amount', 'age', 'account_balance', 'amount_time_interaction', 'age_balance_interaction']

# Preprocessing pipeline for numeric and categorical features
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Model pipeline
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', LogisticRegression(max_iter=1000))])

# Train the model
model_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

# Under-sampling
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)

# Ensemble technique
bbc = BalancedBaggingClassifier(base_estimator=LogisticRegression(max_iter=1000),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=42)
bbc_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('ensemble', bbc)])

bbc_pipeline.fit(X_res, y_res)

# Evaluate ensemble model
y_pred_ensemble = bbc_pipeline.predict(X_test)
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
conf_matrix_ensemble = confusion_matrix(y_test, y_pred_ensemble)
class_report_ensemble = classification_report(y_test, y_pred_ensemble)

print(f'Ensemble Model Accuracy: {accuracy_ensemble}')
print('Ensemble Model Confusion Matrix:')
print(conf_matrix_ensemble)
print('Ensemble Model Classification Report:')
print(class_report_ensemble)

# Hyperparameter tuning
param_grid = {
    'ensemble__base_estimator__C': [0.01, 0.1, 1, 10, 100],
    'ensemble__n_estimators': [10, 50, 100]
}

grid_search = GridSearchCV(estimator=bbc_pipeline, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_res, y_res)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Score: {best_score}')


Accuracy: 0.99655
Confusion Matrix:
[[19373    33]
 [   36   558]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19406
           1       0.94      0.94      0.94       594

    accuracy                           1.00     20000
   macro avg       0.97      0.97      0.97     20000
weighted avg       1.00      1.00      1.00     20000



TypeError: BalancedBaggingClassifier.__init__() got an unexpected keyword argument 'base_estimator'

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('creditcard.csv')

# Separate features and labels
X = data.drop('is_fraud', axis=1)
y = data['is_fraud']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Categorical features
categorical_features = ['gender', 'transaction_location', 'transaction_type', 'card_present', 'repeat_customer', 'merchant_category', 'previous_fraud', 'device_type', 'internet_access']

# Numeric features
numeric_features = ['transaction_time', 'transaction_amount', 'age', 'account_balance', 'amount_time_interaction', 'age_balance_interaction']

# Preprocessing pipeline for numeric and categorical features
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Model pipeline
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', LogisticRegression(max_iter=1000))])

# Train the model
model_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

# Under-sampling
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)

# Ensemble technique
bbc = BalancedBaggingClassifier(estimator=LogisticRegression(max_iter=1000),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=42)
bbc_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('ensemble', bbc)])

bbc_pipeline.fit(X_res, y_res)

# Evaluate ensemble model
y_pred_ensemble = bbc_pipeline.predict(X_test)
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
conf_matrix_ensemble = confusion_matrix(y_test, y_pred_ensemble)
class_report_ensemble = classification_report(y_test, y_pred_ensemble)

print(f'Ensemble Model Accuracy: {accuracy_ensemble}')
print('Ensemble Model Confusion Matrix:')
print(conf_matrix_ensemble)
print('Ensemble Model Classification Report:')
print(class_report_ensemble)

# Hyperparameter tuning
param_grid = {
    'ensemble__estimator__C': [0.01, 0.1, 1, 10, 100],
    'ensemble__n_estimators': [10, 50, 100]
}

grid_search = GridSearchCV(estimator=bbc_pipeline, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_res, y_res)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Score: {best_score}')


Accuracy: 0.99655
Confusion Matrix:
[[19373    33]
 [   36   558]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19406
           1       0.94      0.94      0.94       594

    accuracy                           1.00     20000
   macro avg       0.97      0.97      0.97     20000
weighted avg       1.00      1.00      1.00     20000

Ensemble Model Accuracy: 0.98385
Ensemble Model Confusion Matrix:
[[19083   323]
 [    0   594]]
Ensemble Model Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     19406
           1       0.65      1.00      0.79       594

    accuracy                           0.98     20000
   macro avg       0.82      0.99      0.89     20000
weighted avg       0.99      0.98      0.99     20000

Best Parameters: {'ensemble__estimator__C': 100, 'ensemble__n_estimators': 10}
Best Score: 0.9941818166117231


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('updated_creditcard.csv')

# Separate features and labels
X = data.drop('is_fraud', axis=1)
y = data['is_fraud']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Categorical features
categorical_features = ['gender', 'transaction_location', 'transaction_type', 'card_present', 'repeat_customer', 'merchant_category', 'previous_fraud', 'device_type', 'internet_access']

# Numeric features
numeric_features = ['transaction_time', 'transaction_amount', 'age', 'account_balance', 'amount_time_interaction', 'age_balance_interaction']

# Preprocessing pipeline for numeric and categorical features
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Model pipeline
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', LogisticRegression(max_iter=1000))])

# Train the model
model_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

# Under-sampling
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)

# Ensemble technique
bbc = BalancedBaggingClassifier(estimator=LogisticRegression(max_iter=1000),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=42)
bbc_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('ensemble', bbc)])

bbc_pipeline.fit(X_res, y_res)

# Evaluate ensemble model
y_pred_ensemble = bbc_pipeline.predict(X_test)
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
conf_matrix_ensemble = confusion_matrix(y_test, y_pred_ensemble)
class_report_ensemble = classification_report(y_test, y_pred_ensemble)

print(f'Ensemble Model Accuracy: {accuracy_ensemble}')
print('Ensemble Model Confusion Matrix:')
print(conf_matrix_ensemble)
print('Ensemble Model Classification Report:')
print(class_report_ensemble)

# Hyperparameter tuning
param_grid = {
    'ensemble__estimator__C': [0.01, 0.1, 1, 10, 100],
    'ensemble__n_estimators': [10, 50, 100]
}

grid_search = GridSearchCV(estimator=bbc_pipeline, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_res, y_res)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Score: {best_score}')


Accuracy: 0.96725
Confusion Matrix:
[[18798   199]
 [  456   547]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     18997
           1       0.73      0.55      0.63      1003

    accuracy                           0.97     20000
   macro avg       0.85      0.77      0.80     20000
weighted avg       0.96      0.97      0.96     20000

Ensemble Model Accuracy: 0.91295
Ensemble Model Confusion Matrix:
[[17309  1688]
 [   53   950]]
Ensemble Model Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.91      0.95     18997
           1       0.36      0.95      0.52      1003

    accuracy                           0.91     20000
   macro avg       0.68      0.93      0.74     20000
weighted avg       0.97      0.91      0.93     20000

Best Parameters: {'ensemble__estimator__C': 0.1, 'ensemble__n_estimators': 100}
Best Score: 0.9293219870679499


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('NUcreditcard.csv')

# Separate features and labels
X = data.drop('is_fraud', axis=1)
y = data['is_fraud']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Categorical features
categorical_features = ['gender', 'transaction_location', 'transaction_type', 'card_present', 'repeat_customer', 'merchant_category', 'previous_fraud', 'device_type', 'internet_access']

# Numeric features
numeric_features = ['transaction_time', 'transaction_amount', 'age', 'account_balance', 'amount_time_interaction', 'age_balance_interaction']

# Preprocessing pipeline for numeric and categorical features
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Model pipeline
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', LogisticRegression(max_iter=1000))])

# Train the model
model_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

# Under-sampling
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)

# Ensemble technique
bbc = BalancedBaggingClassifier(estimator=LogisticRegression(max_iter=1000),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=42)
bbc_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('ensemble', bbc)])

bbc_pipeline.fit(X_res, y_res)

# Evaluate ensemble model
y_pred_ensemble = bbc_pipeline.predict(X_test)
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
conf_matrix_ensemble = confusion_matrix(y_test, y_pred_ensemble)
class_report_ensemble = classification_report(y_test, y_pred_ensemble)

print(f'Ensemble Model Accuracy: {accuracy_ensemble}')
print('Ensemble Model Confusion Matrix:')
print(conf_matrix_ensemble)
print('Ensemble Model Classification Report:')
print(class_report_ensemble)

# Hyperparameter tuning
param_grid = {
    'ensemble__estimator__C': [0.01, 0.1, 1, 10, 100],
    'ensemble__n_estimators': [10, 50, 100]
}

grid_search = GridSearchCV(estimator=bbc_pipeline, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_res, y_res)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Score: {best_score}')


Accuracy: 0.92
Confusion Matrix:
[[18306    84]
 [ 1516    94]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     18390
           1       0.53      0.06      0.11      1610

    accuracy                           0.92     20000
   macro avg       0.73      0.53      0.53     20000
weighted avg       0.89      0.92      0.89     20000

Ensemble Model Accuracy: 0.7239
Ensemble Model Confusion Matrix:
[[13265  5125]
 [  397  1213]]
Ensemble Model Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.72      0.83     18390
           1       0.19      0.75      0.31      1610

    accuracy                           0.72     20000
   macro avg       0.58      0.74      0.57     20000
weighted avg       0.91      0.72      0.79     20000

Best Parameters: {'ensemble__estimator__C': 0.1, 'ensemble__n_estimators': 100}
Best Score: 0.7285602503912363
