# <center>MODEL DEVELOPMENT AND EVALUATION</center>

In [1]:
import pandas as pd
file_path = "../data/data_description.csv" 
dc = pd.read_csv(file_path)
df = pd.read_parquet('../data/data.parquet')
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
train = df[df['trans_date_trans_time'] <= pd.to_datetime('2020-06-21 12:13:37')]
test = df[df['trans_date_trans_time'] >= pd.to_datetime('2020-06-21 12:14:25')]
train.to_parquet('../data/train.parquet')
test.to_parquet('../data/test.parquet')

## Model Selection

Given the nature of fraud detection, where the goal is to distinguish between fraudulent and non-fraudulent transactions, ensemble learning algorithms such as **Random Forest** and **Gradient Boosting** are suitable. These algorithms handle complex, non-linear relationships within the data and are robust against overfitting. Also, the ability of ensemble methods to combine multiple weak learners into a strong learner makes them effective for detecting patterns indicative of fraud.

Considering the binary nature of the classification task (fraud or non-fraud), **Logistic Regression** can serve as a baseline model due to its simplicity and interpretability. It provides a clear understanding of the impact of features on the likelihood of fraud. 

## Model Training

In [2]:
from sklearn.ensemble import RandomForestClassifier ,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [3]:
# Separate features and target variable for training data
X_train = train.drop(['is_fraud', 'trans_date_trans_time', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip'], axis=1)
y_train = train['is_fraud']

In [4]:
# Label encode categorical columns
label_encoder = LabelEncoder()
X_train_encoded = X_train.copy()

for column in X_train.select_dtypes(include=['object']).columns:
    X_train_encoded[column] = label_encoder.fit_transform(X_train[column])

In [5]:
# Initialize models
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
gradient_boosting_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
logistic_regression_model = LogisticRegression(random_state=42)

In [6]:
# Train models
random_forest_model.fit(X_train_encoded, y_train)

In [7]:
gradient_boosting_model.fit(X_train_encoded, y_train)

In [8]:
logistic_regression_model.fit(X_train_encoded, y_train)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
import joblib

joblib.dump(random_forest_model, 'random_forest_model.pkl')
joblib.dump(gradient_boosting_model, 'gradient_boosting_model.pkl')
joblib.dump(logistic_regression_model, 'logistic_regression_model.pkl')

['logistic_regression_model.pkl']

### hyperparameters

In [10]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Define a smaller parameter grid
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

In [None]:
# Create the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

In [None]:
# Create RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_grid, n_iter=5, cv=3, scoring='accuracy', random_state=42, n_jobs=-1)

In [None]:
# Fit the model
random_search.fit(X_train_encoded, y_train)

In [None]:
# Get the best parameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

## Model Evaluation

## Conclusion