In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import glob
import numpy as np

# Locate the training CSV shards. glob returns paths in arbitrary order, so sort
# them: the row order of the combined frame feeds the seeded train_test_split
# below, and a stable order keeps that split reproducible across runs/machines.
files = sorted(glob.glob('../data/splited_data/*.csv'))
if not files:
    # Fail here with a clear message instead of a KeyError on 'is_fraud' later.
    raise FileNotFoundError("No CSV files found matching '../data/splited_data/*.csv'")

# Columns excluded from the feature set before modelling
COLUMNS_TO_DROP = [
    'Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'first', 'last',
    'street', 'city', 'state', 'zip', 'dob', 'trans_num'
]

# Read every shard, drop the unused columns, and concatenate once.
# (Concatenating inside the loop re-copies all accumulated rows per iteration.)
combined_data = pd.concat(
    [pd.read_csv(file).drop(columns=COLUMNS_TO_DROP) for file in files],
    ignore_index=True,
)

# Separate features and target
X = combined_data.drop('is_fraud', axis=1)
y = combined_data['is_fraud']

# Hold out 30% of the rows, stratified on the target so both splits keep the
# same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Identify categorical and numerical features by dtype
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()

# Scale numeric columns to [0, 1] and one-hot encode the object columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=True, handle_unknown='ignore'), categorical_features)
    ]
)

# Build the model pipeline: preprocessing followed by class-weighted Logistic Regression
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced', max_iter=100))
])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = model.predict(X_test)

# Evaluate the model
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)



Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.88      0.93    644585
           1       0.04      0.77      0.07      3753

    accuracy                           0.88    648338
   macro avg       0.52      0.82      0.50    648338
weighted avg       0.99      0.88      0.93    648338



In [4]:

# Score the external test file in chunks, collecting labels, predictions, and
# the original rows that the model flags as fraud.
test_file_path = '../data/fraudTest.csv'
pipeline = model
chunk_size = 50000  # Adjust chunk size based on memory constraints
y_test_all = []
y_pred_all = []
fraudulent_predictions = []

# The feature column list is fixed once the pipeline is fitted, so compute it
# once instead of on every chunk.
preprocessor_step = pipeline.named_steps['preprocessor']
feature_columns = preprocessor_step.transformers_[0][2] + preprocessor_step.transformers_[1][2]

for chunk in pd.read_csv(test_file_path, chunksize=chunk_size):
    # Drop unnecessary columns (same as training data)
    chunk_cleaned = chunk.drop(columns=COLUMNS_TO_DROP, errors='ignore')

    # Use a chunk-local name: rebinding X_test here would silently replace the
    # held-out split created by the training cell.
    X_chunk = chunk_cleaned[feature_columns]

    # Keep the true labels when the file provides them
    if 'is_fraud' in chunk_cleaned.columns:
        y_test_all.append(chunk_cleaned['is_fraud'])

    # Pipeline.predict applies the fitted preprocessor and then the classifier
    chunk_predictions = pipeline.predict(X_chunk)
    y_pred_all.append(chunk_predictions)

    # Keep the original (un-dropped) rows where the prediction is 1
    fraudulent_predictions.append(chunk[chunk_predictions == 1])

if not y_pred_all:
    raise ValueError(f"No rows were read from {test_file_path!r}")

# Combine all predictions and true labels
y_pred_all = np.concatenate(y_pred_all)
y_test_all = (
    pd.concat(y_test_all, ignore_index=True)
    if y_test_all
    else pd.Series(name='is_fraud', dtype='int64')
)

# Evaluate the model only when every predicted row has a matching label
if len(y_test_all) == len(y_pred_all):
    print("Classification Report:\n", classification_report(y_test_all, y_pred_all))
    print("Confusion Matrix:\n", confusion_matrix(y_test_all, y_pred_all))
else:
    print("Skipping evaluation: 'is_fraud' labels are missing for some or all rows.")

# Combine all fraudulent predictions into a single DataFrame
fraudulent_predictions_df = pd.concat(fraudulent_predictions, ignore_index=True)

# Print the fraudulent predictions
print("Fraudulent predictions:")
print(fraudulent_predictions_df)



Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.91      0.95    553574
           1       0.02      0.54      0.04      2145

    accuracy                           0.91    555719
   macro avg       0.51      0.72      0.50    555719
weighted avg       0.99      0.91      0.95    555719

Confusion Matrix:
 [[502705  50869]
 [   991   1154]]
Fraudulent predictions:
       Unnamed: 0 trans_date_trans_time               cc_num  \
0              58   2020-06-21 12:32:36     2227671554547514   
1              67   2020-06-21 12:34:07     3502377050801561   
2             100   2020-06-21 12:45:48     6592074504369261   
3             101   2020-06-21 12:46:08  4400011257587661852   
4             133   2020-06-21 12:55:19        4683638447911   
...           ...                   ...                  ...   
52018      555533   2020-12-31 22:53:19  4209696857872688515   
52019      555605   2020-12-31 23:17:43     406057972652823

In [8]:
# Re-score the external test file in larger chunks, printing the true-label and
# predicted-label distributions per chunk and collecting the flagged rows.
test_file_path = '../data/fraudTest.csv'

chunk_size = 500000  # Adjust chunk size based on memory constraints
fraudulent_predictions = []

# The feature column list is fixed once the pipeline is fitted, so compute it
# once instead of on every chunk.
preprocessor_step = model.named_steps['preprocessor']
feature_columns = preprocessor_step.transformers_[0][2] + preprocessor_step.transformers_[1][2]

for chunk in pd.read_csv(test_file_path, chunksize=chunk_size):
    # Drop unnecessary columns (same as training data)
    chunk_cleaned = chunk.drop(columns=COLUMNS_TO_DROP, errors='ignore')

    # Check class distribution in the current chunk
    if 'is_fraud' in chunk_cleaned.columns:
        print("Class distribution in chunk:")
        print(chunk_cleaned['is_fraud'].value_counts())

    # Use a chunk-local name: rebinding X_test here would silently replace the
    # held-out split created by the training cell.
    X_chunk = chunk_cleaned[feature_columns]

    # Pipeline.predict applies the fitted preprocessor and then the classifier
    chunk_predictions = model.predict(X_chunk)

    # Check prediction distribution
    print("Prediction distribution in chunk:")
    print(pd.Series(chunk_predictions).value_counts())

    # Keep the original (un-dropped) rows where the prediction is 1
    fraudulent_predictions.append(chunk[chunk_predictions == 1])

if not fraudulent_predictions:
    # pd.concat on an empty list raises an opaque "No objects to concatenate"
    raise ValueError(f"No rows were read from {test_file_path!r}")

# Combine all fraudulent predictions into a single DataFrame
fraudulent_predictions_df = pd.concat(fraudulent_predictions, ignore_index=True)

# Print the fraudulent predictions
print("Fraudulent predictions:")
print(fraudulent_predictions_df)

Class distribution in chunk:
is_fraud
0    497876
1      2124
Name: count, dtype: int64




Prediction distribution in chunk:
0    455011
1     44989
Name: count, dtype: int64
Class distribution in chunk:
is_fraud
0    55698
1       21
Name: count, dtype: int64
Prediction distribution in chunk:
0    51228
1     4491
Name: count, dtype: int64
Fraudulent predictions:
       Unnamed: 0 trans_date_trans_time               cc_num  \
0              58   2020-06-21 12:32:36     2227671554547514   
1              67   2020-06-21 12:34:07     3502377050801561   
2             100   2020-06-21 12:45:48     6592074504369261   
3             101   2020-06-21 12:46:08  4400011257587661852   
4             133   2020-06-21 12:55:19        4683638447911   
...           ...                   ...                  ...   
49475      555533   2020-12-31 22:53:19  4209696857872688515   
49476      555571   2020-12-31 23:05:07     3550554389581384   
49477      555605   2020-12-31 23:17:43     4060579726528237   
49478      555629   2020-12-31 23:26:42     2248735346244816   
49479      555637   