In [8]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Load the dataset
train_data = pd.read_csv('fraudTrain.csv')
test_data = pd.read_csv('fraudTest.csv')

# Exploratory data analysis
print(train_data.head())
print(train_data.info())
print(train_data.describe())

# Check for missing values
print(train_data.isnull().sum())

# Data preprocessing
# Drop any rows with missing values
train_data = train_data.dropna()


   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

In [12]:
# Feature selection
features = ['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long']
X = train_data[features]
y = train_data['is_fraud']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify non-numeric columns in training data
non_numeric_columns_train = X_train.select_dtypes(exclude=np.number).columns

# Drop non-numeric columns from X_train
X_train = X_train.drop(columns=non_numeric_columns_train)

# Identify non-numeric columns in test data
non_numeric_columns_test = X_test.select_dtypes(exclude=np.number).columns

# Drop non-numeric columns from X_test
X_test = X_test.drop(columns=non_numeric_columns_test)

# Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Decision Tree model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

# Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Model evaluation
print("Logistic Regression:")
print(confusion_matrix(y_test, lr_pred))
print(classification_report(y_test, lr_pred))
print("Accuracy:", accuracy_score(y_test, lr_pred))

print("\nDecision Tree:")
print(confusion_matrix(y_test, dt_pred))
print(classification_report(y_test, dt_pred))
print("Accuracy:", accuracy_score(y_test, dt_pred))

print("\nRandom Forest:")
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))
print("Accuracy:", accuracy_score(y_test, rf_pred))


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression:
[[87053     0]
 [  559     0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     87053
         1.0       0.00      0.00      0.00       559

    accuracy                           0.99     87612
   macro avg       0.50      0.50      0.50     87612
weighted avg       0.99      0.99      0.99     87612

Accuracy: 0.9936195954892024

Decision Tree:
[[86751   302]
 [  270   289]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     87053
         1.0       0.49      0.52      0.50       559

    accuracy                           0.99     87612
   macro avg       0.74      0.76      0.75     87612
weighted avg       0.99      0.99      0.99     87612

Accuracy: 0.9934712139889513

Random Forest:
[[87002    51]
 [  267   292]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     87053
         1.0       0.85      0.52      0.65       559

    accuracy                  