In [1]:

# import libraries
# MIT License

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

# import model-selection utilities and metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report, confusion_matrix,\
roc_auc_score, roc_curve, precision_recall_curve, fbeta_score, recall_score,\
precision_recall_fscore_support
from sklearn.inspection import permutation_importance

# import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# import feature selection
from sklearn.feature_selection import VarianceThreshold 
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE

# import sampler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek
# K-Nearest Neighbor OveRsampling (KNNOR)
from knnor import data_augment

# import classifiers
from sklearn.ensemble import RandomForestClassifier

# import regressors
from sklearn.ensemble import RandomForestRegressor

from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

from scipy.special import inv_boxcox

# import library for feature engineering
# autofeat: Automated Feature Engineering Toolkit
# @inproceedings{horn2019autofeat,
#   title={The autofeat Python Library for Automated Feature Engineering and Selection},
#   author={Horn, Franziska and Pack, Robert and Rieger, Michael},
#   booktitle={Joint European Conference on Machine Learning and Knowledge Discovery in Databases},
#   pages={111--120},
#   year={2019},
#   organization={Springer}
# }
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor


from autofeat import FeatureSelector
# import system
import os
import sys

import scipy.stats as stats

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os

# import scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from sigfig import round
import itertools

import warnings
warnings.filterwarnings("ignore")

In [2]:
def calculate_profit(tp, fp, fn, tn, benefit_tp, cost_fp, cost_fn, benefit_tn):
    """Return the net profit implied by a set of confusion-matrix counts.

    Correct predictions (true positives, true negatives) contribute their
    per-case benefit; wrong predictions (false positives, false negatives)
    subtract their per-case cost.
    """
    gain_from_tp = tp * benefit_tp
    loss_from_fp = fp * cost_fp
    loss_from_fn = fn * cost_fn
    gain_from_tn = tn * benefit_tn
    return gain_from_tp - loss_from_fp - loss_from_fn + gain_from_tn

def plot_total_profit_bar_chart(keywords, results_df, benefit_tp=60, cost_fp=40,
                                cost_fn=55, benefit_tn=10):
    """Draw a bar chart of the estimated total profit of each result row.

    Parameters
    ----------
    keywords : str
        Name of the column in ``results_df`` whose values label the bars.
    results_df : pandas.DataFrame
        One row per evaluated configuration. Each row must hold a 2x2
        confusion matrix laid out as ``[[tn, fp], [fn, tp]]`` in the
        ``'Confusion_Matrix'`` column.
    benefit_tp, cost_fp, cost_fn, benefit_tn : number, optional
        Per-case benefit/cost applied to the respective confusion-matrix cell.
        The defaults are the values this notebook has always used.

    Raises
    ------
    ValueError
        If ``results_df`` has no rows (there would be nothing to plot).
    """
    # Fail early with a readable message instead of opening an empty figure
    # and then crashing on `zip(*[])`.
    if len(results_df) == 0:
        raise ValueError('results_df is empty; there is nothing to plot.')

    total_profits = []
    for _, row in results_df.iterrows():
        # Extract confusion matrix components
        tn, fp = row['Confusion_Matrix'][0]
        fn, tp = row['Confusion_Matrix'][1]

        total_profit = calculate_profit(tp, fp, fn, tn, benefit_tp, cost_fp, cost_fn, benefit_tn)
        total_profits.append((row[keywords], total_profit))

    # Most profitable configuration first, for easier reading of the chart.
    total_profits.sort(key=lambda pair: pair[1], reverse=True)
    labels, profits = zip(*total_profits)

    plt.figure(figsize=(12, 8))
    plt.bar(labels, profits, color='skyblue')
    plt.xlabel(keywords)
    plt.ylabel('Total Estimated Profit')
    plt.title('Total Profit Comparison for Different Feature Selectors')
    plt.xticks(rotation=45)
    plt.show()

In [3]:
# Load the encoded feature matrix and the target column from disk.
X = pd.read_csv('../data/cls/cls_encoder.csv')
y = pd.read_csv('../data/cls/cls_target.csv')

# X_df stays a DataFrame; y_df ends up as a flat 1-D NumPy array of labels.
X_df = pd.DataFrame(X)
y_df = pd.DataFrame(y).values.ravel()

In [4]:
def train_model(X, y, model, sampler, feature_selector):
    """Train and evaluate one classifier through a fixed preprocessing chain.

    The chain is: 80/20 train/test split (``random_state=33``) -> standard
    scaling -> oversampling of the training split -> feature selection ->
    PCA (at most 2 components) -> fit -> evaluation on the held-out split.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix.
    y : array-like
        Binary target (precision/recall are computed with their default
        binary averaging).
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``. It is fitted in place.
    sampler : object
        Exposes ``fit_resample(X, y)``; applied to the training split only.
    feature_selector : transformer
        Exposes ``fit_transform(X, y)`` and ``transform(X)``. It is fitted in place.

    Returns
    -------
    dict
        Keys: ``training_score``, ``accuracy``, ``precision``, ``recall``,
        ``Confusion_Matrix``, ``Classification_Report``, ``cross_val_scores``,
        ``cross_val_mean`` and ``aoc_score`` (ROC-AUC on the test split).
    """
    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

    # Scale using statistics of the training split only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Resample the *scaled* training data. Previously the raw X_train was
    # resampled, so the selector/PCA/model were fitted on unscaled features
    # but evaluated on scaled ones.
    X_train_res, y_train_res = sampler.fit_resample(X_train_scaled, y_train)

    # apply feature selector
    X_train_selected = feature_selector.fit_transform(X_train_res, y_train_res)
    X_test_selected = feature_selector.transform(X_test_scaled)

    # PCA needs at least two input columns to be meaningful here.
    n_features = X_train_selected.shape[1]
    if n_features > 1:
        pca = PCA(n_components=min(n_features, 2))
        X_train_pca = pca.fit_transform(X_train_selected)
        X_test_pca = pca.transform(X_test_selected)
    else:
        # Skip PCA if only one feature is present
        X_train_pca = X_train_selected
        X_test_pca = X_test_selected

    # fit model and predict on the untouched test split
    model.fit(X_train_pca, y_train_res)
    y_pred = model.predict(X_test_pca)

    # ROC-AUC is a ranking metric: feed it continuous scores when the model
    # offers them, falling back to hard labels only as a last resort.
    if hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test_pca)
    elif hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test_pca)[:, 1]
    else:
        y_score = y_pred

    # evaluate
    train_score = model.score(X_train_pca, y_train_res)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred, output_dict=True)
    # NOTE(review): cross-validation runs on data that was already resampled
    # and reduced using the full training split, so these scores are likely
    # optimistic. Wrapping the steps in a pipeline would avoid that.
    cross_val_scores = cross_val_score(model, X_train_pca, y_train_res, cv=5)
    cross_val_mean = cross_val_scores.mean()
    aoc_score = roc_auc_score(y_test, y_score)

    result = {
        'training_score': train_score,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,  # was computed but never returned
        'Confusion_Matrix': cm,
        'Classification_Report': cr,
        'cross_val_scores': cross_val_scores,
        'cross_val_mean': cross_val_mean,
        'aoc_score': aoc_score
    }
    return result


In [5]:
def visualize_results(result, keywords=None):
    """Plot recall, F1, precision and mean CV score for each evaluated configuration.

    Parameters
    ----------
    result : pandas.DataFrame
        One row per configuration. It must contain a label column (named by
        ``keywords``), a column of ``classification_report(..., output_dict=True)``
        dicts named ``'classification_report'`` or ``'Classification_Report'``,
        and a ``'cross_val_scores'`` column.
    keywords : str, optional
        Name of the label column. When omitted, a module-level variable called
        ``keywords`` is used if one exists (the original code relied on it).

    Raises
    ------
    ValueError
        If no label column name is available.
    """
    # The original body read undefined globals `keywords` and `results_df`;
    # take the column name as an argument and read labels from `result` itself.
    if keywords is None:
        keywords = globals().get('keywords')
    if keywords is None:
        raise ValueError("Pass `keywords`: the name of the label column in `result`.")

    labels = result[keywords].tolist()

    # train_model() stores the report under 'Classification_Report'; accept
    # the lower-case spelling used originally as well.
    if 'classification_report' in result:
        report_column = 'classification_report'
    else:
        report_column = 'Classification_Report'

    recall_scores_avg = []
    recall_scores_1 = []
    f1_scores = []
    precision_scores = []
    for report in result[report_column]:
        recall_scores_avg.append(report['weighted avg']['recall'])
        # assumes the positive class is reported under the key '1' - TODO confirm
        recall_scores_1.append(report['1']['recall'])
        f1_scores.append(report['weighted avg']['f1-score'])
        precision_scores.append(report['weighted avg']['precision'])

    # A bar needs a single number, so collapse each per-fold score array to its mean.
    cv_scores = [float(np.mean(fold_scores)) for fold_scores in result['cross_val_scores']]

    # Creating a DataFrame for visualization
    metrics_df = pd.DataFrame({
        keywords: labels,
        'Recall on average': recall_scores_avg,
        'Recall on default': recall_scores_1,
        'F1-Score': f1_scores,
        'Precision': precision_scores,
        'cross_val_scores': cv_scores
    })

    # Plotting: one group of bars per configuration, one bar per metric.
    plt.figure(figsize=(12, 6))
    sns.barplot(x=keywords, y='value', hue='variable', data=pd.melt(metrics_df, id_vars=keywords))
    plt.title('Recall and F1-Score for Different Samplers')
    plt.ylabel('Score')
    plt.xlabel(keywords)
    plt.xticks(rotation=45)
    plt.legend(title='Metric')
    plt.show()

# Logistic Regression:

In [6]:
import sys
import time
import itertools

# Import other necessary libraries

def grid_search_from_scratch(X, y, model, sampler, feature_selector, parameter_grid):
    """Evaluate ``model`` on every combination of values in ``parameter_grid``.

    Each combination is applied with ``model.set_params`` and evaluated with
    ``train_model``; a single-line progress indicator with an estimated
    remaining time is written to stdout.

    Parameters
    ----------
    X, y : passed through to ``train_model``.
    model : estimator exposing ``set_params``; it is mutated in place.
    sampler, feature_selector : passed through to ``train_model``.
    parameter_grid : dict
        Maps parameter name to a list of candidate values. An empty dict
        yields a single run with the model's current parameters.

    Returns
    -------
    list of dict
        One ``{'params': ..., 'results': ...}`` entry per combination, in
        iteration order.
    """
    # Building the product from explicit key/value lists also works for an
    # empty grid, where `zip(*{}.items())` cannot be unpacked.
    keys = list(parameter_grid)
    value_lists = [parameter_grid[key] for key in keys]
    parameter_combinations = [dict(zip(keys, combo)) for combo in itertools.product(*value_lists)]

    total_combinations = len(parameter_combinations)
    report = []
    widest_line = 0  # longest progress line so far, used to blank out leftovers

    start_time = time.perf_counter()

    for idx, params in enumerate(parameter_combinations):
        model.set_params(**params)
        results = train_model(X, y, model, sampler, feature_selector)

        report.append({
            'params': params,
            'results': results
        })

        elapsed_time = time.perf_counter() - start_time
        avg_time_per_iteration = elapsed_time / (idx + 1)
        estimated_remaining_time = avg_time_per_iteration * (total_combinations - idx - 1)

        line = f'Progress: {idx + 1}/{total_combinations} (Estimated Time Remaining: {estimated_remaining_time:.2f} seconds)'
        # '\r' only moves the cursor; pad with spaces so a shorter line fully
        # overwrites a longer previous one (otherwise stray characters remain).
        widest_line = max(widest_line, len(line))
        sys.stdout.write('\r' + line.ljust(widest_line))
        sys.stdout.flush()

    if total_combinations:
        # Finish the progress line so later output starts on a fresh line.
        sys.stdout.write('\n')
        sys.stdout.flush()

    return report

# Oversampler and feature selector passed to the grid searches further down.
# NOTE(review): the same two objects are re-created with identical arguments
# in the following cell; one of the two definitions is redundant.
final_sampler = SVMSMOTE(random_state=33)
final_feature_selector = SelectKBest(f_classif, k=5)

In [7]:
# Shared preprocessing objects for the grid searches below:
# SVM-SMOTE oversampling and selection of the 5 best features by ANOVA F-score.
final_sampler = SVMSMOTE(random_state=33)
final_feature_selector = SelectKBest(score_func=f_classif, k=5)

In [8]:
import pickle

# Hyper-parameter grid for logistic regression (5 * 4 * 1 * 5 * 1 = 100 combinations).
parameters_lg = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'max_iter': [100, 200, 400, 500],         # Maximum number of iterations to converge
    'penalty': ['l2'],        # Type of regularization
    'solver': ['newton-cholesky', 'lbfgs', 'liblinear', 'sag', 'saga'],  # Algorithm to use for optimization
    'random_state': [33]
}

logistic_regression_report = grid_search_from_scratch(X_df, y_df, LogisticRegression(), final_sampler, final_feature_selector, parameters_lg)

# Save the report. The context manager closes (and flushes) the file, which
# the previous bare `open(...)` passed to pickle.dump never did.
with open('../data/cls/logistic_regression_report.pkl', 'wb') as report_file:
    pickle.dump(logistic_regression_report, report_file)

Progress: 100/100 (Estimated Time Remaining: 0.00 seconds))

In [10]:
# Hyper-parameter grid for the support-vector classifier (360 combinations).
# NOTE(review): per the scikit-learn SVC docs, `degree` is ignored by every
# kernel except 'poly' and `gamma` is not used by 'linear', so many of these
# combinations repeat the same effective model.
parameters_svm = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel function to use
    'gamma': ['scale', 'auto'],  # Kernel coefficient
    'degree': [2, 3, 4],  # Degree of the polynomial kernel function
    'cache_size': [700],
    'max_iter': [100, 200, -1],         # Maximum number of iterations to converge
    'random_state': [33]
}

svm_report = grid_search_from_scratch(X_df, y_df, SVC(), final_sampler, final_feature_selector, parameters_svm)

# Save the report. The context manager closes (and flushes) the file, which
# the previous bare `open(...)` passed to pickle.dump never did.
with open('../data/cls/svm_report.pkl', 'wb') as report_file:
    pickle.dump(svm_report, report_file)

Progress: 35/360 (Estimated Time Remaining: 71843.98 seconds)

In [None]:
# Hyper-parameter grid for the random forest (4 * 2 * 5 * 3 * 3 * 2 * 2 = 1440 combinations).
parameters_random_forest = {
    'n_estimators': [100, 200, 400, 500],  # Number of trees in the forest
    'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
    'max_depth': [None, 10, 20, 50, 100],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'max_features': ['sqrt', 'log2'],  # Number of features to consider when looking for the best split
    'bootstrap': [True, False],  # Whether bootstrap samples are used when building trees
    'random_state': [33],
    'n_jobs': [-1]
}

random_forest_report = grid_search_from_scratch(X_df, y_df, RandomForestClassifier(), final_sampler, final_feature_selector, parameters_random_forest)

# Save the report. The context manager closes (and flushes) the file, which
# the previous bare `open(...)` passed to pickle.dump never did.
with open('../data/cls/random_forest_report.pkl', 'wb') as report_file:
    pickle.dump(random_forest_report, report_file)