In [1]:
import numpy as np 
import pandas as pd
import os
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,classification_report,make_scorer, f1_score
from sklearn.utils import resample
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import model_selection
from joblib import dump, load
import mlflow.sklearn
import mlflow
# import pymysql
from mlflow.models import infer_signature
from Feature_engineer import feature_engineer_steps
from mlflow.tracking import MlflowClient

ModuleNotFoundError: No module named 'Feature_engineer'

In [None]:
# Load the three raw tables from the local data/ folder.
transactions_df = pd.read_csv("data/transactions_df.csv")
terminal_profiles_df = pd.read_csv("data/terminal_profiles_table.csv")
customer_profiles_df = pd.read_csv("data/customer_profiles_table.csv")

# Attach the terminal profile, then the customer profile, to every transaction.
# Both joins are inner joins, so transactions without a matching terminal_id /
# customer_id are dropped.
join_terminal = pd.merge(
    left=transactions_df,
    right=terminal_profiles_df,
    how='inner',
    on='terminal_id',
)
join_customer = pd.merge(
    left=join_terminal,
    right=customer_profiles_df,
    how='inner',
    on='customer_id',
)


In [None]:
# Sanity check: list every column available after the two joins.
print(join_customer.columns.tolist())

In [None]:
# Write a 100-row random sample of the raw transactions (fixed seed, so the
# same rows are drawn on every run) to data/user_demo_data.csv for the client demo.
samle_file = transactions_df.sample(n=100, random_state=42)
samle_file.to_csv('data/user_demo_data.csv', index=False)

In [None]:
# Feature engineering and one-hot encoding of the categorical features.
# The implementation lives in Feature_engineer.py; its result is unpacked here
# as train_X (features) and train_y (presumably the target labels — confirm
# in Feature_engineer.py).
train_X,train_y = feature_engineer_steps(join_customer)

In [None]:
# Sanity check: list the columns produced by the feature-engineering step.
print(train_X.columns.tolist())

In [None]:
#  Feature selection as part of the default pipeline
def remove_unwanted_col(train):
    """Return ``train`` restricted to the columns that are used as features.

    Columns whose name appears in the exclusion list below (mostly IDs and
    other unwanted columns) are left out; every other column is kept in its
    original order. Names in the list that are absent from ``train`` are
    simply ignored. The input frame itself is not modified.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the non-excluded columns.
    """
    excluded = frozenset((
        'transaction_id',
        'bin_y',
        'mcc',
        'bin_x',
        'customer_id',
        'available_terminals',
        'terminal_id',
        'timestamp',
        'date',
        'post_ts',
        'using_available_terminals',
    ))
    # Deliberately NOT excluded (earlier candidates for removal, currently kept):
    # lat_terminal, log_terminal, lat_customer, log_customer, mean_amount,
    # mean_nb_tx_per_day, timestamp_numeric, per_day_difference_count.
    kept = [name for name in train.columns if name not in excluded]
    return train[kept]
# Keep only the feature columns. Re-running this cell is harmless: the filter
# is by column name, so a second pass removes nothing further.
train_X = remove_unwanted_col(train_X)


In [None]:
# Hold out 5% of the rows for evaluation; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified on train_y — confirm that is
# acceptable if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y,test_size=0.05, random_state=42)

In [None]:
# Load the previously saved model from disk.
# NOTE(review): joblib files are pickle-based — only load artifacts that come
# from a trusted source.
best_model = load('../saved_model/best_model.joblib')
# Presumably an IsolationForest (see imports), whose predict() returns -1 for
# outliers and +1 for inliers — confirm what was actually saved.
y_pred = best_model.predict(X_test)
# Recode -1 to 0 in place, so 0 marks the rows the model flagged and 1 the rest.
y_pred[y_pred == -1] = 0
# Rows of X_test predicted as 0, and their index labels.
anomalies = X_test[y_pred == 0]
anomalies_index = anomalies.index



In [None]:
#show result in confusion matrix plot and return model metric 
def show_result(test, pred, output_path="confusion_matrix.png"):
    """Plot the confusion matrix, save it as an image, and return weighted metrics.

    Parameters
    ----------
    test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, same length as ``test``.
    output_path : str, optional
        File the confusion-matrix figure is written to. Defaults to
        ``"confusion_matrix.png"``, the name the MLflow logging cell uploads.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)`` read from the ``'weighted avg'`` row of
        ``classification_report``.
    """
    cm = confusion_matrix(test, pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    # Save the figure this display actually drew on instead of relying on
    # pyplot's implicit "current figure".
    disp.figure_.savefig(output_path)

    report = classification_report(test, pred, output_dict=True)
    weighted = report['weighted avg']
    # Named ``f1`` so the local does not shadow sklearn's imported ``f1_score``.
    f1 = weighted['f1-score']
    return weighted['precision'], weighted['recall'], f1
    
def pred_baseon_threshold(model, test_data,threshold):
    """Turn decision-function scores into 0/1 labels using a custom threshold.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``decision_function(X)``.
    test_data : array-like or pandas.DataFrame
        Samples to score.
    threshold : float
        Cut-off applied to the scores.

    Returns
    -------
    numpy.ndarray
        Float array with ``0.0`` where the score is strictly below
        ``threshold`` and ``1.0`` everywhere else.

    Notes
    -----
    For scikit-learn's IsolationForest, lower ``decision_function`` scores are
    more abnormal, so ``0.0`` marks the more anomalous side of the threshold.
    NOTE(review): ``export_anomaly`` in this notebook exports rows labelled 1 —
    confirm both agree on which label means "anomaly".
    """
    test_scores = np.asarray(model.decision_function(test_data), dtype=float)
    # Build the labels in one step from the untouched scores. The previous
    # two-step in-place masking (`< threshold -> 0`, then `!= 0 -> 1`) left a
    # score of exactly 0.0 labelled 0 even when it was at or above the threshold.
    return np.where(test_scores < threshold, 0.0, 1.0)
    
def make_use_reject_anomalies(model, test_data, position,sensitivity,current_threshold):
    """Shift the threshold using the scores of selected rows.

    The scores that are not below ``current_threshold`` are collected in their
    original order (the code previously called these "anomalies"). For each
    entry of ``position``, the score at that position in the collected list,
    multiplied by ``sensitivity``, is added to the threshold.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``decision_function(X)``.
    test_data : array-like or pandas.DataFrame
        Samples to score.
    position : iterable of int
        Positions within the collected scores (not within ``test_data``).
    sensitivity : float
        Weight applied to each selected score.
    current_threshold : float
        Threshold to start from.

    Returns
    -------
    float
        The adjusted threshold; equal to ``current_threshold`` when
        ``position`` is empty.

    Raises
    ------
    IndexError
        If a position is outside the collected scores.

    Notes
    -----
    NOTE(review): for scikit-learn's IsolationForest, lower scores are more
    abnormal, so the scores collected here lie on the less anomalous side of
    the threshold — confirm this matches the intended feedback loop.
    """
    test_scores = np.asarray(model.decision_function(test_data), dtype=float)
    # Select on the untouched scores. The previous version zeroed the
    # below-threshold entries in place and then kept `!= 0`, which also dropped
    # genuine scores of exactly 0.0 and shifted every later position.
    candidates = test_scores[~(test_scores < current_threshold)]

    new_threshold = current_threshold
    for index in position:
        new_threshold += candidates[index] * sensitivity
    return new_threshold
def export_anomaly(original_df, pred_list):
    """Write the rows of ``original_df`` labelled 1 to a timestamped CSV file.

    Labels are matched to rows by position: the i-th label belongs to the
    i-th row, whatever the frame's index looks like (for example the shuffled
    index left behind by ``train_test_split``).

    Parameters
    ----------
    original_df : pandas.DataFrame
        Frame the labels refer to.
    pred_list : array-like
        One label per row of ``original_df``; rows whose label equals 1 are
        exported.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    ValueError
        If ``pred_list`` is not one-dimensional or its length differs from
        the number of rows.
    """
    # A plain boolean array selects by position. Wrapping the labels in a new
    # pd.Series would give them a fresh 0..n-1 index that pandas then tries to
    # align with the frame's own index.
    flagged = np.asarray(pred_list) == 1
    if flagged.ndim != 1 or len(flagged) != len(original_df):
        raise ValueError(
            "pred_list must contain exactly one label per row of original_df"
        )
    anomalies = original_df[flagged]

    date_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = "export_anomaly"
    # to_csv does not create missing folders.
    os.makedirs(output_dir, exist_ok=True)
    filename = f"{output_dir}/anomaly_{date_time}.csv"
    anomalies.to_csv(filename, index=False)
    return 0

In [None]:
# Decision-function scores of the training rows, used to calibrate a custom cut-off.
train_scores = best_model.decision_function(X_train)
# Custom cut-off: the value below which 15% of the training scores fall.
# NOTE(review): this is independent of the estimator's own contamination
# setting; per the scikit-learn docs, contamination='auto' fixes the model's
# internal offset at -0.5 rather than setting a contamination percentage.
train_threshold = np.percentile(train_scores, 15) # 15th percentile of the training scores
# Label the hold-out rows against that cut-off (0 = score below it, 1 = otherwise).
y_pred = pred_baseon_threshold(best_model, X_test, train_threshold)
# Adjust the cut-off from positions 1, 3, 5 and 6 of the scores that are not
# below it, each weighted by a sensitivity of 0.005.
adjusted_threshold = make_use_reject_anomalies(best_model, X_test, [1,3,5,6], 0.005,train_threshold)


In [None]:
# Weighted-average precision / recall / F1 of the threshold-based predictions
# on the hold-out set; also writes confusion_matrix.png as a side effect.
precision, recall, f1_socre = show_result(y_test,y_pred)
# Metrics that the MLflow cell below logs for this run.
# NOTE(review): "f1_socre" is a typo for "f1_score". It is kept as-is because
# the variable and the MLflow metric key may be referenced elsewhere — rename
# both together when that is safe.
metrics = {"precision": precision, "recall": recall, "f1_socre": f1_socre,"train_threshold":train_threshold}
# Hyper-parameters of the loaded estimator, logged alongside the metrics.
params = best_model.get_params()

***MLflow-related code***

In [None]:
# Point the MLflow client at the tracking server at http://127.0.0.1:8080.
# This only configures the client; the server must already be running.
mlflow.set_tracking_uri("http://127.0.0.1:8080")


# Make "Isolation Forest" the active experiment for the runs logged below
# (per the MLflow docs, set_experiment creates it if it does not exist yet).
mlflow.set_experiment("Isolation Forest")

In [None]:
# Log the evaluated model, its parameters, metrics and supporting files to MLflow.
artifact_path = "artifact_location"

# A few rows are enough to illustrate the expected input; passing the whole
# training frame would store all of it next to the model.
input_example = X_train.head(5)
# The signature's output schema must describe what the model returns, so it is
# inferred from the model's predictions rather than from the ground-truth labels.
signature = infer_signature(X_test, best_model.predict(X_test))

# Initiate the MLflow run context
with mlflow.start_run() as run:
    # Hyper-parameters of the evaluated estimator
    mlflow.log_params(params)
    # Feature-engineering source, kept with the run for traceability
    mlflow.log_artifact(local_path = "Feature_engineer.py")
    mlflow.log_metrics(metrics)
    # Image written by show_result()
    mlflow.log_artifact("confusion_matrix.png")
    # Log an instance of the trained model for later use
    model_info = mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path=artifact_path,
        input_example=input_example,
        signature=signature,
    )
