<a href="https://colab.research.google.com/github/Vinay7115/EncryptixML_task/blob/main/credit_card_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the data
cc_fraud_test_data = pd.read_csv('fraudTest.csv')
cc_fraud_train_data = pd.read_csv('fraudTrain.csv')

In [None]:
# Check for missing values
print(cc_fraud_test_data.isnull().sum())
print(cc_fraud_train_data.isnull().sum())

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               1
is_fraud                 1
dtype: int64
Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat            

In [None]:
# Drop unnecessary columns
columns_to_drop = ['trans_date_trans_time', 'merchant', 'first', 'last', 'street', 'state', 'job', 'dob', 'Unnamed: 0', 'trans_num']
cc_fraud_test_data.drop(columns=columns_to_drop, inplace=True)
cc_fraud_train_data.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Convert categorical columns to numerical columns using one-hot encoding
categorical_cols = ['category', 'gender', 'city']
cc_fraud_train_data = pd.get_dummies(cc_fraud_train_data, columns=categorical_cols)
cc_fraud_test_data = pd.get_dummies(cc_fraud_test_data, columns=categorical_cols)

In [None]:
# Ensure the test set has the same dummy variables as the train set
cc_fraud_test_data = cc_fraud_test_data.reindex(columns=cc_fraud_train_data.columns, fill_value=0)

In [None]:
# Scale numerical columns using StandardScaler
scaler = StandardScaler()
numerical_cols = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']
cc_fraud_train_data[numerical_cols] = scaler.fit_transform(cc_fraud_train_data[numerical_cols])
cc_fraud_test_data[numerical_cols] = scaler.transform(cc_fraud_test_data[numerical_cols])

In [None]:
# Separate features and target variable
x_train = cc_fraud_train_data.drop('is_fraud', axis=1)
y_train = cc_fraud_train_data['is_fraud']
x_test = cc_fraud_test_data.drop('is_fraud', axis=1)
y_test = cc_fraud_test_data['is_fraud']

In [None]:
# Check and handle missing values in the target variable
print("Missing values in y_train:", y_train.isnull().sum())
print("Missing values in y_test:", y_test.isnull().sum())

# Remove rows where the target variable is NaN
x_train = x_train[~y_train.isnull()]
y_train = y_train.dropna()
x_test = x_test[~y_test.isnull()]
y_test = y_test.dropna()

Missing values in y_train: 1
Missing values in y_test: 1


In [None]:
# Impute missing values in features
imputer = SimpleImputer(strategy='mean')
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)

In [None]:
# Train the Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Make predictions
y_pred_log_reg = log_reg.predict(x_test)

# Evaluate the model
print("Logistic Regression:")
print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("Classification Report:")
print(classification_report(y_test, y_pred_log_reg))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))

Logistic Regression:
Accuracy: 0.9969835055516334
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     15534
         1.0       0.00      0.00      0.00        47

    accuracy                           1.00     15581
   macro avg       0.50      0.50      0.50     15581
weighted avg       0.99      1.00      1.00     15581

Confusion Matrix:
[[15534     0]
 [   47     0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)

y_pred_rf = rf.predict(x_test)
print("Random Forest:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

Random Forest:
Accuracy: 0.9968551440857455
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     15534
         1.0       0.00      0.00      0.00        47

    accuracy                           1.00     15581
   macro avg       0.50      0.50      0.50     15581
weighted avg       0.99      1.00      1.00     15581

Confusion Matrix:
[[15532     2]
 [   47     0]]


In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)

y_pred_dt = dt.predict(x_test)
print("Decision Tree:")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))

Decision Tree:
Accuracy: 0.9962775174892498
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     15534
         1.0       0.00      0.00      0.00        47

    accuracy                           1.00     15581
   macro avg       0.50      0.50      0.50     15581
weighted avg       0.99      1.00      1.00     15581

Confusion Matrix:
[[15523    11]
 [   47     0]]
