In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Step 1: Load the training dataset
train_data = pd.read_csv("fraudTrain.csv")

In [3]:
# Step 2: Display the first few rows of the dataset
train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [4]:
# Step 3: Check for missing values
missing_values = train_data.isnull().sum()
missing_values

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [5]:
# Step 4: Statistics of numerical features
numerical_stats = train_data.describe()
numerical_stats

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,648337.0,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652
std,374318.0,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,324168.5,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0
50%,648337.0,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0
75%,972505.5,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0
max,1296674.0,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0


In [6]:
# Step 5: Checking the distribution of the target variable
fraud_distribution = train_data['is_fraud'].value_counts(normalize=True)
fraud_distribution

is_fraud
0    0.994211
1    0.005789
Name: proportion, dtype: float64

In [7]:
# Step 6: Compare columns of the train and test data
test_data = pd.read_csv("fraudTest.csv")
train_columns = set(train_data.columns)
test_columns = set(test_data.columns)
common_columns = set(train_columns).intersection(test_columns)
train_unique_columns = set(train_columns) - common_columns
test_unique_columns = set(test_columns) - common_columns

common_columns, train_unique_columns, test_unique_columns

({'Unnamed: 0',
  'amt',
  'category',
  'cc_num',
  'city',
  'city_pop',
  'dob',
  'first',
  'gender',
  'is_fraud',
  'job',
  'last',
  'lat',
  'long',
  'merch_lat',
  'merch_long',
  'merchant',
  'state',
  'street',
  'trans_date_trans_time',
  'trans_num',
  'unix_time',
  'zip'},
 set(),
 set())

In [8]:
# Step 7: Define a function for extracting datetime information
def extract_datetime_info(X):
    X['trans_date_trans_time'] = pd.to_datetime(X['trans_date_trans_time'])
    X['trans_hour'] = X['trans_date_trans_time'].dt.hour
    X['trans_day'] = X['trans_date_trans_time'].dt.day
    X['trans_month'] = X['trans_date_trans_time'].dt.month
    X['trans_year'] = X['trans_date_trans_time'].dt.year
    X['dob'] = pd.to_datetime(X['dob'])  # Convert 'dob' column to datetime format
    X['age'] = 2024 - X['dob'].dt.year
    return X.drop(columns=['trans_date_trans_time', 'dob'])

In [9]:
# Step 8: Define preprocessing steps for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [10]:
# Step 9: Define preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, selector(dtype_include="number")),
        ('cat', categorical_transformer, selector(dtype_include="category"))
    ])

In [11]:
# Step 10: Combine feature engineering and preprocessing steps into one pipeline
preprocessing_pipeline = Pipeline(steps=[
    ('feature_engineering', FunctionTransformer(extract_datetime_info, validate=False)),
    ('preprocessor', preprocessor)
])

In [12]:
# Step 11: Preprocess the training data
X_processed = preprocessing_pipeline.fit_transform(train_data)

In [13]:
# Step 12: Split the preprocessed data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, train_data['is_fraud'], test_size=0.2, random_state=42)

In [14]:
# Step 13: Initialize classifiers (Logistic Regression and Random Forest)
log_reg = LogisticRegression()
random_forest = RandomForestClassifier()

In [15]:
# Step 14: Train and evaluate each classifier
for clf in [log_reg, random_forest]:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf.__class__.__name__} accuracy: {accuracy}")

LogisticRegression accuracy: 1.0
RandomForestClassifier accuracy: 1.0


In [16]:
# Step 15: Fit the Random Forest classifier on the training data
random_forest.fit(X_train, y_train)

In [17]:
# Step 16: Predict on the test data
y_pred_rf = random_forest.predict(X_test)

In [18]:
# Step 17: Compute evaluation metrics
print("Random Forest Classifier Evaluation Metrics on Test Data:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Classifier Evaluation Metrics on Test Data:
Confusion Matrix:
[[257815      0]
 [     0   1520]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       1.00      1.00      1.00      1520

    accuracy                           1.00    259335
   macro avg       1.00      1.00      1.00    259335
weighted avg       1.00      1.00      1.00    259335

