In [1]:
import os
import gc
import pandas as pd
import numpy as np
import json
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import importlib
####
import hashlib
import nltk
from nltk.tokenize import word_tokenize
####
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
######
nltk.download('punkt')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Helper Functions

In [2]:
def flatten_list_recursive(nested_list):
    flat_list = []
    for item in nested_list:
        if isinstance(item, list):
            flat_list.extend(flatten_list_recursive(item))
        else:
            flat_list.append(item)
    return flat_list

def tokenize_category(category):
    # Tokenize the category using NLTK's word_tokenize
    tokens = word_tokenize(category.lower())
    return tokens

def hash_tokens(tokens):
    # Hash the tokens using SHA-1 # sha-256 is too long
    hashed_tokens = [hashlib.sha1(token.encode()).hexdigest() for token in tokens]
    return hashed_tokens


def get_hash_tokenised(df, cat_col):
    #
    uidx = df[cat_col].drop_duplicates().index
    # Apply tokenization and hashing to the 'category_column'
    df['tokens'] = df[cat_col].apply(tokenize_category)
    df['hashed_tokens'] = df['tokens'].apply(hash_tokens)

    # One-hot encode the hashed tokens
    hashed_tokens_df = pd.get_dummies(df['hashed_tokens'].apply(pd.Series).stack()).groupby(level=0).sum()

    # Combine the one-hot encoded DataFrame with the original DataFrame
    df = pd.concat([df, hashed_tokens_df], axis=1)

    # Drop the unnecessary columns
    decrypt_labels = list(zip(flatten_list_recursive(df.tokens[uidx]),
                              flatten_list_recursive(df.hashed_tokens[uidx])))
  
    return df.drop(columns=[cat_col, 'tokens', 'hashed_tokens']), decrypt_labels\
    

def load_sklearn_function(module_name, function_name):
    """
    Load a specific function from a given scikit-learn module.

    Parameters:
        module_name (str): The name of the scikit-learn module to import (e.g., 'sklearn.linear_model').
        function_name (str): The name of the function to load from the specified module.

    Returns:
        function: The desired function object if found, or None if not found.
    """
    try:
        module = importlib.import_module(module_name)
        function = getattr(module, function_name, None)
        return function
    except ImportError:
        print(f"Error: The module '{module_name}' could not be imported.")
        return None
    except AttributeError:
        print(f"Error: The function '{function_name}' was not found in module '{module_name}'.")
        return None

In [3]:
def data_reducer_trees(df, num_trees, depth, num_final_columns):
    # Separate features and target variable
    X = df.drop(columns='target')
    y = df['target']

    # Create a Random Forest Regressor model
    model = RandomForestRegressor(n_estimators = num_trees, max_depth = depth)

    # Fit the model to calculate feature importances
    model.fit(X, y)

    # Get feature importances
    feature_importances = model.feature_importances_

    # Sort the features based on importance scores
    sorted_features = sorted(zip(X.columns, feature_importances), key=lambda x: x[1], reverse=True)

    # Select the top N features based on the desired number of final columns
    selected_features = [feature[0] for feature in sorted_features[:num_final_columns]]

    # Create the reduced dataframe with the top N features and the target column
    reduced_df = df[selected_features + ['target']]

    return reduced_df

In [4]:
def data_reducer_pca(df, target_column_name, num_final_columns):
    # Separate features and target variable
    X = df.drop(columns=target_column_name)
    y = df[target_column_name]

    # Create PCA object with the desired number of final columns
    pca = PCA(n_components=num_final_columns)

    # Fit and transform the data to the reduced number of columns
    reduced_features = pca.fit_transform(X)

    # Create a new DataFrame with the reduced features and the target column
    reduced_df = pd.DataFrame(data=reduced_features, columns=[f"PC{i}" for i in range(1, num_final_columns + 1)])
    reduced_df[target_column_name] = y

    return reduced_df

In [5]:
def data_reduer_corr(df, target_column_name, num_final_columns):
    # Calculate the correlation between each feature and the target variable
    correlations = df.corr()[target_column_name].abs().sort_values(ascending=False)

    # Exclude the target variable from the correlation results
    correlations = correlations.drop(target_column_name)

    # Select the top N features based on the desired number of final columns
    selected_features = list(correlations.index[:num_final_columns])

    # Create the reduced dataframe with the top N features and the target column
    reduced_df = df[selected_features + [target_column_name]]

    return reduced_df


In [6]:
def find_min_max_pairs(input_list):
    min_max_pairs = []
    remaining_items = set(input_list)  # Convert input_list to a set for faster lookups
    for item in input_list:
        if 'max' in item:
            min_item = item.replace('max', 'min')
            if min_item in remaining_items:
                min_max_pairs.append((min_item, item))
                remaining_items.remove(min_item)  # Remove the found items from the set
                remaining_items.remove(item)
    return min_max_pairs, list(remaining_items)  # Convert set back to list for output

In [7]:
data = [
    ("LinearRegression", "regression", "sklearn.linear_model", "LinearRegression"),
    ("SGD", "regression", "sklearn.linear_model", "SGDRegressor"),
    ("LassoRegression", "regression", "sklearn.linear_model", "Lasso"),
    ("xg_boost", "regression", "NA", "NA"),
    ("DecisionTreeRegressor", "regression", "sklearn.tree", "DecisionTreeRegressor"),
    ("extra_random_trees", "regression", "sklearn.tree", "ExtraTreeRegressor"),
    ("GBTRegressor", "regression", "sklearn.ensemble", "GradientBoostingRegressor"),
    ("RandomForestRegressor", "regression", "sklearn.ensemble", "RandomForestRegressor"),
    ("SVM", "regression", "sklearn.svm", "SVR"),
    ("neural_network", "regression", "sklearn.neural_network", "MLPRegressor"),
    ("RidgeRegression", "regression", "sklearn.linear_model", "Ridge"),
    ("ElasticNetRegression", "regression", "sklearn.linear_model", "ElasticNet"),
    ("KNN", "regression", "neighbors", "KNeighborsRegressor"),
    ("Logistic Regression", "classification", "sklearn.linear_model", "LogisticRegression"),
    ("neural_network", "classification", "sklearn.neural_network", "MLPClassifier"),
    ("SGD", "classification", "sklearn.linear_model", "SGDClassifier"),
    ("RandomForestClassifier", "classification", "sklearn.ensemble", "RandomForestClassifier"),
    ("SVM", "classification", "sklearn.svm", "SVC"),
    ("xg_boost", "classification", "NA", "NA"),
    ("DecisionTreeClassifier", "classification", "sklearn.tree", "DecisionTreeClassifier"),
    ("extra_random_trees", "classification", "sklearn.tree", "ExtraTreeClassifier"),
    ("GBTClassifier", "classification", "sklearn.ensemble", "GradientBoostingClassifier"),
    ("KNN", "classification", "neighbors", "KNeighborsClassifier")
]


model_look_up = pd.DataFrame(data, columns=["user_model_name", "task", "module", "function"])

model_look_up

del data

Unnamed: 0,user_model_name,task,module,function
0,LinearRegression,regression,sklearn.linear_model,LinearRegression
1,SGD,regression,sklearn.linear_model,SGDRegressor
2,LassoRegression,regression,sklearn.linear_model,Lasso
3,xg_boost,regression,,
4,DecisionTreeRegressor,regression,sklearn.tree,DecisionTreeRegressor
5,extra_random_trees,regression,sklearn.tree,ExtraTreeRegressor
6,GBTRegressor,regression,sklearn.ensemble,GradientBoostingRegressor
7,RandomForestRegressor,regression,sklearn.ensemble,RandomForestRegressor
8,SVM,regression,sklearn.svm,SVR
9,neural_network,regression,sklearn.neural_network,MLPRegressor


# Read data

In [8]:
json_fpath = r'InputData\algoparams_from_ui.json'
with open(json_fpath) as obj:
  json_content = obj.read()
parsed_json_data = json.loads(json_content)
# print(json_content)

# Explore json file content

In [9]:
json_keys = list(parsed_json_data.keys())
json_keys
parsed_json_data[json_keys[0]]
parsed_json_data[json_keys[1]]
# 'design_state_data' states the requirements
parsed_json_data[json_keys[2]]

['session_name', 'session_description', 'design_state_data']

'test'

'test'

{'session_info': {'project_id': '1',
  'experiment_id': 'kkkk-11',
  'dataset': 'iris_modified.csv',
  'session_name': 'test',
  'session_description': 'test'},
 'target': {'prediction_type': 'Regression',
  'target': 'petal_width',
  'type': 'regression',
  'partitioning': True},
 'train': {'policy': 'Split the dataset',
  'time_variable': 'sepal_length',
  'sampling_method': 'No sampling(whole data)',
  'split': 'Randomly',
  'k_fold': False,
  'train_ratio': 0,
  'random_seed': 0},
 'metrics': {'optomize_model_hyperparameters_for': 'AUC',
  'optimize_threshold_for': 'F1 Score',
  'compute_lift_at': 0,
  'cost_matrix_gain_for_true_prediction_true_result': 1,
  'cost_matrix_gain_for_true_prediction_false_result': 0,
  'cost_matrix_gain_for_false_prediction_true_result': 0,
  'cost_matrix_gain_for_false_prediction_false_result': 0},
 'feature_handling': {'sepal_length': {'feature_name': 'sepal_length',
   'is_selected': True,
   'feature_variable_type': 'numerical',
   'feature_details

In [10]:
rqs = parsed_json_data[json_keys[2]]
main_keys = list(rqs.keys())
# main_keys

In [11]:
# for ks in main_keys:
#     print(f'{ks} contains {rqs[ks]}')

In [12]:
# repo_algo_dict.keys()
# for algo_name in list(repo_algo_dict.keys()):
#     repo_algo_dict[algo_name]

In [13]:
# for item in rqs['feature_handling'].keys():
#     rqs['feature_handling'][item]
       

# Analysis

In [14]:
# Read the target and type of regression to be run
rqs['target']
fpath = r'InputData\iris.csv'
iris_data = pd.read_csv(fpath)
iris_data.info()
#Read the features (which are column names in the csv) and 
#figure out what missing imputation needs to be applied 
# and apply that to the columns loaded in a dataframe

{'prediction_type': 'Regression',
 'target': 'petal_width',
 'type': 'regression',
 'partitioning': True}

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [15]:
traget_name = rqs['target']['target']
traget_name

'petal_width'

In [16]:
# what columns are to be used for analysis
col_booleans = [rqs['feature_handling'][col]['is_selected'] for col in iris_data.columns]
iris_data_sub = iris_data.loc[:,col_booleans]
# setting right the dtypes of all selected columns
col_dtypes = [rqs['feature_handling'][col]['feature_variable_type'] for col in iris_data_sub.columns]
for idx, x in enumerate(col_dtypes):
    if x == 'numerical':
        if is_numeric_dtype(iris_data_sub.iloc[:,idx]):
            pass
        else:
            print('rectified numeric data with fase dtype')
            iris_data_sub.iloc[:,idx] = pd.to_numeric(iris_data_sub.iloc[:,idx], errors = 'coerce')
    elif x == 'text':
        if is_string_dtype(iris_data_sub.iloc[:,idx]):
            pass
        else:
            print('rectified text data with fase dtype')
            iris_data_sub.iloc[:,idx] =  iris_data_sub.iloc[:,idx].astype(str)
############replace blanks with np.nan
iris_data_sub = iris_data_sub.replace(r'^\s*$', np.nan, regex=True)
###########check missing data#####
msn_stats = iris_data_sub.isna().sum()
msn_cols = list(msn_stats[msn_stats==0].index)
if msn_stats.sum() == 0:
    print('Yay! No missing data')
else:
    print('Oops! missing data found')
    msn_cols = list(msn_stats[msn_stats==0].index)
    print('Check {msn_cols}')
# del msn_stats
gc.collect();
#######what to do with missing data######
for col in msn_cols:
    try:
        if rqs['feature_handling'][col]['feature_details']['missing_values'] == 'Impute':
            
            imp_func = rqs['feature_handling'][col]['feature_details']['impute_with']
            if imp_func == 'Average of values':
                iris_data_sub[col] = iris_data_sub[col].fillna(iris_data_sub[col].mean())
                print(f'Imputing {col} with mean')
            elif imp_func == 'custom':
                imp_val = rqs['feature_handling'][col]['feature_details']['impute_value']
                iris_data_sub[col] = iris_data_sub[col].fillna(imp_val)
                print(f'Imputing {col} with custom value {imp_val}')
            else:
                raise ValueError
                print('Imputation technique not supported')
    except KeyError:
        pass
iris_data_sub.head()
reduct_boolean = True
#######reduce data###########
#No Reduction, Corr with Target, Tree-based, PCA.
if 'No Reduction' in list(rqs['feature_reduction'].keys()):
    print('no feature reduction necessary')
    reduct_boolean = False
    pass
else:
    if any(isinstance(i,dict) for i in rqs['feature_reduction'].values()): # if nested
        print('feature reduction input should not be a nested dictionary')
    else:
        freduc_name = rqs['feature_reduction']['feature_reduction_method']
        if (iris_data_sub.shape[1] -1)  == int(rqs['feature_reduction']['num_of_features_to_keep']):
            print('no feature reduction necessary')
            reduct_boolean = False
        elif (iris_data_sub.shape[1] -1) > int(rqs['feature_reduction']['num_of_features_to_keep']):
            if freduc_name.lower() == 'tree-based':
                    num_trees = int(rqs['feature_reduction']['num_of_trees'])
                    depth = int(rqs['feature_reduction']['depth_of_trees'])
                    iris_data_subr = data_reducer_trees(iris_data_sub,  num_trees, depth,
                                    num_final_columns = int(rqs['feature_reduction']['num_of_features_to_keep']))
            elif freduc_name.lower() == 'correlation with target':
                if 'text' in col_dtypes:
                    print('cannot apply pearson corr to categorical datatype')
                    raise ValueError
                else:
                    iris_data_subr = data_reduer_corr(iris_data_sub, 
                                                      target_column_name=traget_name, 
                                                      num_final_columns=int(rqs['feature_reduction']['num_of_features_to_keep']))
            elif freduc_name.lower() == 'pca':
                    iris_data_subr = data_reducer_pca(iris_data_sub, 
                                                      target_column_name=traget_name, 
                                                      num_final_columns=int(rqs['feature_reduction']['num_of_features_to_keep']))    
            else:
                raise ValueError
            print('reduction method not supported')
                 
        else:
            raise ValueError
        
if not reduct_boolean:
    iris_data_subr = iris_data_sub.copy()
    del iris_data_sub
    # #######any feature  encoding###########
    feature_handling_keys = [x+'_handling' for x in col_dtypes]
    feature_handling_actions = [rqs['feature_handling'][col]['feature_details'][key] for col,key in zip(iris_data_subr.columns,feature_handling_keys)]
    feature_handling_actions  = [None if 'Keep as' in x else x for x in feature_handling_actions]
    for idx, item in enumerate(feature_handling_actions):
        if item is None:
            pass
        elif item == 'Tokenize and hash':
            cat_col = iris_data_subr.columns[idx]
            print(f'{cat_col} column is tokenized and hashed')
            iris_data_subr, label_mapping = get_hash_tokenised(iris_data_subr, cat_col)   
        else:
            raise ValueError
    encoded_cols = [x[1] for x in label_mapping]
    encoded_cols
    ###########feature reduction###########


Yay! No missing data


0

Imputing sepal_length with mean
Imputing sepal_width with custom value -1
Imputing petal_length with mean
Imputing petal_width with custom value -2


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


no feature reduction necessary
species column is tokenized and hashed


['b2bd7ff5fd2270c1025fbb6413ec93a67607be31',
 '1739de3503d610e295d933ae2616e7be78b02d43',
 '43825f1f7dae87f9103b3ddf764783a01f57b982']

In [17]:
#whether classification or regression
rqs['target']['prediction_type']
# get algorithm
repo_algo_dict = rqs['algorithms']
for algo_name in list(repo_algo_dict.keys()):
    # repo_algo_dict[algo_name]
    if repo_algo_dict[algo_name]['is_selected'] == False:
        pass
    else:
        repo_algo_dict[algo_name]
        algo2use = algo_name
        params = repo_algo_dict[algo_name]
        break
    print('-----')

'Regression'

-----


{'model_name': 'Random Forest Regressor',
 'is_selected': True,
 'min_trees': 10,
 'max_trees': 20,
 'feature_sampling_statergy': 'Default',
 'min_depth': 20,
 'max_depth': 25,
 'min_samples_per_leaf_min_value': 5,
 'min_samples_per_leaf_max_value': 10,
 'parallelism': 0}

In [18]:
model_basket = []
for algo_name in list(repo_algo_dict.keys()):
    model_basket.append(repo_algo_dict[algo_name]['model_name'])
if rqs['target']['prediction_type'].lower() == 'regression':
    algo_universe = [
                    'LinearRegression', #sklearn.linear_model,linear_model.SGDRegressor
                    'SGD', #sklearn.linear_model
                    'LassoRegression', #sklearn.linear_model
                    'ElasticNetRegression', #sklearn.linear_model
                    'xg_boost', # import xgboost
                    'DecisionTreeRegressor', #sklearn.tree, DecisionTreeRegressor
                    'extra_random_trees', #sklearn.tree, ExtraTreeRegressor
                    'GBDTRegressor', #sklearn.ensemble.GradientBoostingRegressor
                    'RandomForestRegressor', #ensemble.RandomForestRegressor
                    'SVM', # sklearn.svm.SVR
                    'neural_network', # sklearn.neural_network.MLPRegressor
                    'RidgeRegression' #sklearn.linear_model, linear_model.Ridge
                    'KNN' #neighbors.KNeighborsRegressor
                    ]
else:
    algo_universe = ['LogisticRegression',#sklearn.linear_model.LogisticRegression¶
                     'SGD', #sklearn.linear_model, linear_model.SGDClassifier
                    'RandomForestClassifier', #ensemble.RandomForestClassifier
                    'SVM',# sklearn.svm.SVC
                    'xg_boost', # import xgboost
                        'DecisionTreeClassifier', ##sklearn.tree, DecisionTreeClassifier
                    'extra_random_trees', ##sklearn.tree, ExtraTreeClassifier
                    'GBTClassifier', # #sklearn.ensemble.GradientBoostingClassifier
                    'neural_network', # sklearn.neural_network.MLPClassifier
                    'KNN' # neighbors.KNeighborsClassifier
                    ]
if algo2use not in algo_universe:
    print('wrong model for the task')
else:
    loaded_sklearn_model = load_sklearn_function(module_name = model_look_up.loc[model_look_up.user_model_name == algo2use, 'module'].item(), 
                                                function_name = model_look_up.loc[model_look_up.user_model_name == algo2use, 'function'].item())
    loaded_sklearn_model 
#do hyper parameter tuning i.e., use GridSearchCV

sklearn.ensemble._forest.RandomForestRegressor

In [19]:
if rqs['target']['prediction_type'].lower() == 'regression':
    scorer = make_scorer(mean_squared_error)
else:
    scorer = 'roc_auc'

params = repo_algo_dict[algo2use]
params.pop('model_name', None)
params.pop('is_selected', None)
params

pairs, single_elements = find_min_max_pairs(list(params.keys()))
print("Pairs:", pairs)
print("Single Elements:", single_elements)

'Random Forest Regressor'

True

{'min_trees': 10,
 'max_trees': 20,
 'feature_sampling_statergy': 'Default',
 'min_depth': 20,
 'max_depth': 25,
 'min_samples_per_leaf_min_value': 5,
 'min_samples_per_leaf_max_value': 10,
 'parallelism': 0}

Pairs: [('min_trees', 'max_trees'), ('min_depth', 'max_depth'), ('min_samples_per_leaf_min_value', 'min_samples_per_leaf_max_value')]
Single Elements: ['parallelism', 'feature_sampling_statergy']


In [20]:
hp_grid = {}
for min_item,max_item in pairs:
    if 'trees' in max_item:
        hp_grid['n_estimators'] = list(range(params[min_item], params[max_item]+1))
    elif 'sample' in max_item:
        hp_grid['min_samples_leaf'] = list(range(params[min_item], params[max_item]+1))
    elif 'depth' in max_item:
        hp_grid['max_depth'] = list(range(params[min_item], params[max_item]))
    else:
        raise ValueError
        print('hp name not supported')

hp_grid

{'n_estimators': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
 'max_depth': [20, 21, 22, 23, 24],
 'min_samples_leaf': [5, 6, 7, 8, 9, 10]}

In [21]:
grid_search_params = rqs['hyperparameters']
grid_search_params.pop('stratergy', None)
grid_search_params

init_model = loaded_sklearn_model(random_state= int(grid_search_params['random_state']))
init_model.get_params()

cv_model = GridSearchCV(init_model, 
             hp_grid, 
             scoring = scorer, 
             n_jobs= int(grid_search_params['parallelism']), 
             refit=True, 
             cv = int(grid_search_params['num_of_folds']), #integer, to specify the number of folds in a (Stratified)KFold,
             verbose=0, 
            #  error_score=nan, 
             return_train_score=False)

'Grid Search'

{'shuffle_grid': True,
 'random_state': 1,
 'max_iterations': 2,
 'max_search_time': 3,
 'parallelism': 5,
 'cross_validation_stratergy': 'Time-based K-fold(with overlap)',
 'num_of_folds': 6,
 'split_ratio': 0,
 'stratified': True}

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [22]:
cv_model.fit(iris_data_subr.drop(columns = [traget_name]), 
             iris_data_subr[traget_name])

In [23]:
#sorted(cv_model.cv_results_.keys())
cv_table = pd.DataFrame(cv_model.cv_results_)[['params','mean_test_score']].sort_values('mean_test_score', ascending=True)
cv_table

Unnamed: 0,params,mean_test_score
0,"{'max_depth': 20, 'min_samples_leaf': 5, 'n_es...",0.030078
264,"{'max_depth': 24, 'min_samples_leaf': 5, 'n_es...",0.030078
66,"{'max_depth': 21, 'min_samples_leaf': 5, 'n_es...",0.030078
132,"{'max_depth': 22, 'min_samples_leaf': 5, 'n_es...",0.030078
198,"{'max_depth': 23, 'min_samples_leaf': 5, 'n_es...",0.030078
...,...,...
193,"{'max_depth': 22, 'min_samples_leaf': 10, 'n_e...",0.042205
325,"{'max_depth': 24, 'min_samples_leaf': 10, 'n_e...",0.042205
259,"{'max_depth': 23, 'min_samples_leaf': 10, 'n_e...",0.042205
61,"{'max_depth': 20, 'min_samples_leaf': 10, 'n_e...",0.042205
