# Load the models from the YAML config


In [1]:
import yaml
import importlib
import os
import joblib
import pickle
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import os
import pickle
from sklearn.metrics import f1_score
import copy
import pandas as pd
import plotly.express as px



# Cap the number of characters shown per cell when a DataFrame is displayed
pd.options.display.max_colwidth = 30



def load_models_from_config(config_file):
    """Build one estimator per entry of a YAML model configuration.

    The file must contain a top-level ``models`` mapping. Each entry provides
    a ``steps`` list; every step gives the ``module`` to import, the ``class``
    to instantiate from it and the ``name`` the step gets in the pipeline.

    Constructor arguments are taken from the step's optional ``params``
    mapping (for example ``params: {max_iter: 1000}``). A ``strategy`` key
    placed directly on the step is still forwarded as ``strategy=...`` and
    overrides a ``strategy`` entry inside ``params``.

    When an entry has a ``grid_search`` section, its ``param_grid``, ``cv``
    and ``scoring`` values are used to wrap the pipeline in ``GridSearchCV``.

    Parameters
    ----------
    config_file : str or path-like
        Location of the YAML configuration file.

    Returns
    -------
    dict
        Maps each model name to a ``Pipeline`` or, when ``grid_search`` is
        configured, to a ``GridSearchCV`` around that pipeline. The
        estimators are not fitted.

    Raises
    ------
    KeyError
        If a required key (``models``, ``steps``, ``module``, ``class``,
        ``name``, or a ``grid_search`` sub-key) is missing.
    ImportError, AttributeError
        If a step's module or class cannot be resolved.
    """
    # YAML documents are UTF-8 by convention; do not depend on the platform default.
    with open(config_file, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)

    models = {}
    for model_name, model_config in config['models'].items():
        steps = []
        for step_config in model_config['steps']:
            module = importlib.import_module(step_config['module'])
            class_ = getattr(module, step_config['class'])

            # Copy so the parsed configuration is never mutated; `or {}` covers
            # a `params:` key that was left empty in the YAML.
            init_kwargs = dict(step_config.get('params') or {})
            if 'strategy' in step_config:
                init_kwargs['strategy'] = step_config['strategy']

            steps.append((step_config['name'], class_(**init_kwargs)))

        pipeline = Pipeline(steps)

        # Wrap the pipeline in a grid search when the config asks for one
        if 'grid_search' in model_config:
            search_config = model_config['grid_search']
            models[model_name] = GridSearchCV(
                pipeline,
                search_config['param_grid'],
                cv=search_config['cv'],
                scoring=search_config['scoring'],
            )
        else:
            models[model_name] = pipeline

    return models

# Instantiate every (unfitted) model described in the YAML configuration
models = load_models_from_config(config_file='models_config.yaml')


# Train the models

In [2]:

# Path to the train data folder
train_data_folder = 'data/train'

# Fitted estimators, keyed by '<train filename>_<model name>'
trained_models = {}

# Only pickled datasets are training inputs. Anything else in the folder
# (e.g. a `.ipynb_checkpoints` directory) is skipped instead of crashing
# `pickle.load`. Sorting makes the run order independent of the arbitrary
# order returned by `os.listdir`.
train_files = sorted(
    entry for entry in os.listdir(train_data_folder) if entry.endswith('.pkl')
)

for filename in train_files:
    file_path = os.path.join(train_data_folder, filename)

    # NOTE: unpickling executes code embedded in the file; only load trusted data.
    with open(file_path, 'rb') as f:
        data = pickle.load(f)

    # The 'classes' column is the target; every other column is a feature
    X_train = data.drop(columns='classes')
    y_train = data['classes'].astype("category")

    for model_name, model_pipeline in models.items():
        # Fit a fresh copy so the template in `models` stays unfitted
        model = copy.deepcopy(model_pipeline)
        model.fit(X_train, y_train)

        # Save trained model in dictionary with keys based on filename and model name
        key = f'{filename}_{model_name}'
        trained_models[key] = model
    


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [3]:
# Path to the testing data folder
test_data_folder = 'data/test'

# Dictionary to store F1 micro scores for each model (not filled in by this cell)
f1_scores = {}

# One [trained-model key, F1 micro score] pair per evaluated model
res = []


def _dataset_name(name):
    """Return the dataset part of a file name or trained-model key.

    Names look like '<dataset>.pkl...' or '<dataset>|<pollution>.pkl...';
    everything from the first '|' or '.pkl' onwards is dropped.
    """
    return name.split('|', 1)[0].split('.pkl', 1)[0]


# Only pickled datasets are test inputs; sorted for a stable evaluation order
test_files = sorted(
    entry for entry in os.listdir(test_data_folder) if entry.endswith('.pkl')
)

for filename in test_files:
    file_path = os.path.join(test_data_folder, filename)

    # NOTE: unpickling executes code embedded in the file; only load trusted data.
    with open(file_path, 'rb') as f:
        data = pickle.load(f)

    X_test = data.drop(columns='classes')
    y_test = data['classes'].astype("category")

    dataset_name = _dataset_name(filename)

    # Compare dataset names exactly. A prefix test would also pick up models
    # trained on a dataset whose name merely starts with this one
    # (e.g. '..._d_2' would match '..._d_20').
    model_keys = [key for key in trained_models if _dataset_name(key) == dataset_name]

    for model_key in model_keys:
        model = trained_models[model_key]

        # Predict labels using the model
        y_pred = model.predict(X_test)

        # Calculate F1 micro score
        f1_micro = f1_score(y_test, y_pred, average='micro')

        res.append([model_key, f1_micro])



In [4]:
# One row per evaluated model: trained-model key and its F1 micro score
df = pd.DataFrame(res, columns=['name', 'f1_micro'])

# Keys have the form '<train file>.pkl_<model>'. A multi-character pattern is
# interpreted as a regular expression, so the dot is escaped to match a
# literal '.'. `n=1` guarantees exactly two resulting columns.
df[['dataset', 'model']] = df['name'].str.split(r'\.pkl_', n=1, expand=True)

# Train files are named '<dataset>' or '<dataset>|<pollution>'; rows without
# a '|' get a missing pollution value.
df[['dataset', 'pollution']] = df['dataset'].str.split('|', n=1, expand=True)

# Missing values (rows without a pollution suffix) become 0
df = df.fillna(0)

In [5]:
# Regex used to pull each numeric coefficient out of the 'pollution' string.
# Rows whose pollution string lacks a given pattern end up as NaN.
pollution_patterns = {
    'completeness': r'completness_pollution_([0-9.]+)',
    'accuracy': r'feature_accuracy_pollution_([0-9.]+)',
    'dupp_fac': r'duplicate_factor_([0-9]+)',
}

# Extract the captured text and convert it to float, one column at a time
for column, pattern in pollution_patterns.items():
    df[column] = df['pollution'].str.extract(pattern, expand=False).astype(float)

# Order the rows by the three coefficients
df = df.sort_values(by=list(pollution_patterns), ascending=True)

In [11]:
# Labels of the three coefficient columns, and of every column in the frame
pollution_axes = ['completeness', 'accuracy', 'dupp_fac']
pol_col = df[pollution_axes].columns
df_col = df.columns

groups = df.groupby('dataset')

# One line chart per dataset: F1 micro against the pollution setting,
# with one trace per model
for dataset, subset in groups:
    # Rows without coefficients (NaN) are placed first on the x axis
    ordered = subset.sort_values(by=pollution_axes, ascending=True, na_position='first')

    fig = px.line(
        ordered,
        x="pollution",
        y="f1_micro",
        color='model',
        title=ordered['dataset'].iloc[0],
    )
    # The pollution strings are too long to be readable as tick labels
    fig.update_xaxes(showticklabels=False)
    fig.show()
                  
                  

In [47]:
# Rows where every pollution coefficient other than 'completeness' is missing
df[df[pol_col.difference(['completeness'])].isnull().all(axis=1)]

Unnamed: 0,name,f1_micro,dataset,model,pollution,completeness,accuracy,dupp_fac
60,threenormn_500_d_20|comple...,0.592,threenormn_500_d_20,Decision Tree,completness_pollution_0.4,0.4,,
61,threenormn_500_d_20|comple...,0.800,threenormn_500_d_20,Logistic Regression,completness_pollution_0.4,0.4,,
62,threenormn_500_d_20|comple...,0.832,threenormn_500_d_20,KNN,completness_pollution_0.4,0.4,,
189,shapesn_500|completness_po...,0.992,shapesn_500,Decision Tree,completness_pollution_0.4,0.4,,
190,shapesn_500|completness_po...,0.992,shapesn_500,Logistic Regression,completness_pollution_0.4,0.4,,
...,...,...,...,...,...,...,...,...
2086,threenormn_100_d_20.pkl_Lo...,0.800,threenormn_100_d_20,Logistic Regression,0,,,
2087,threenormn_100_d_20.pkl_KNN,0.840,threenormn_100_d_20,KNN,0,,,
2208,threenormn_100_d_5.pkl_Dec...,0.840,threenormn_100_d_5,Decision Tree,0,,,
2209,threenormn_100_d_5.pkl_Log...,0.840,threenormn_100_d_5,Logistic Regression,0,,,


In [42]:
# Display the results frame (the saved output is truncated to its first and last rows)
df

Unnamed: 0,name,f1_micro,dataset,model,pollution,completeness,accuracy,dupp_fac
33,threenormn_500_d_20|duplic...,0.648,threenormn_500_d_20,Decision Tree,duplicate_factor_3_complet...,0.4,0.4,3.0
34,threenormn_500_d_20|duplic...,0.808,threenormn_500_d_20,Logistic Regression,duplicate_factor_3_complet...,0.4,0.4,3.0
35,threenormn_500_d_20|duplic...,0.808,threenormn_500_d_20,KNN,duplicate_factor_3_complet...,0.4,0.4,3.0
216,shapesn_500|duplicate_fact...,0.928,shapesn_500,Decision Tree,duplicate_factor_3_complet...,0.4,0.4,3.0
217,shapesn_500|duplicate_fact...,1.000,shapesn_500,Logistic Regression,duplicate_factor_3_complet...,0.4,0.4,3.0
...,...,...,...,...,...,...,...,...
2086,threenormn_100_d_20.pkl_Lo...,0.800,threenormn_100_d_20,Logistic Regression,0,,,
2087,threenormn_100_d_20.pkl_KNN,0.840,threenormn_100_d_20,KNN,0,,,
2208,threenormn_100_d_5.pkl_Dec...,0.840,threenormn_100_d_5,Decision Tree,0,,,
2209,threenormn_100_d_5.pkl_Log...,0.840,threenormn_100_d_5,Logistic Regression,0,,,


In [45]:
# Row-wise flag: True where every pollution coefficient other than 'completeness' is missing.
# NOTE(review): the output saved under this cell has one entry per column
# ('accuracy', 'dupp_fac'), which is what a column-wise reduction prints;
# it presumably predates the row-wise axis argument. Re-run to refresh.
df[pol_col.difference(['completeness'])].isna().all(axis='columns')

accuracy    False
dupp_fac    False
dtype: bool

In [48]:
# Coefficient columns other than 'completeness'. The label must be spelled
# like the column ('completeness'); the misspelled 'completness' is not in
# pol_col, so difference() silently removed nothing.
other_cols = pol_col.difference(['completeness'])

# Rows where all of the other coefficients are missing, or at least one is.
# The first condition is implied by the second; both are kept as written.
df[df[other_cols].isnull().all(axis=1) | df[other_cols].isnull().any(axis=1)]

Unnamed: 0,name,f1_micro,dataset,model,pollution,completeness,accuracy,dupp_fac
60,threenormn_500_d_20|comple...,0.592,threenormn_500_d_20,Decision Tree,completness_pollution_0.4,0.4,,
61,threenormn_500_d_20|comple...,0.800,threenormn_500_d_20,Logistic Regression,completness_pollution_0.4,0.4,,
62,threenormn_500_d_20|comple...,0.832,threenormn_500_d_20,KNN,completness_pollution_0.4,0.4,,
189,shapesn_500|completness_po...,0.992,shapesn_500,Decision Tree,completness_pollution_0.4,0.4,,
190,shapesn_500|completness_po...,0.992,shapesn_500,Logistic Regression,completness_pollution_0.4,0.4,,
...,...,...,...,...,...,...,...,...
2086,threenormn_100_d_20.pkl_Lo...,0.800,threenormn_100_d_20,Logistic Regression,0,,,
2087,threenormn_100_d_20.pkl_KNN,0.840,threenormn_100_d_20,KNN,0,,,
2208,threenormn_100_d_5.pkl_Dec...,0.840,threenormn_100_d_5,Decision Tree,0,,,
2209,threenormn_100_d_5.pkl_Log...,0.840,threenormn_100_d_5,Logistic Regression,0,,,


In [51]:
# Labels of the three coefficient columns, and of every column in the frame
pollution_axes = ['completeness', 'accuracy', 'dupp_fac']
pol_col = df[pollution_axes].columns
df_col = df.columns

# Keep rows whose 'accuracy' and 'dupp_fac' coefficients are both missing,
# plus rows where all three coefficients are missing.
others_missing = df[pol_col.difference(['completeness'])].isna().all(axis='columns')
all_missing = df[pol_col].isna().all(axis='columns')
groups = df[others_missing | all_missing].groupby('dataset')

# One line chart per dataset, one trace per model
for dataset, subset in groups:
    # Rows without coefficients (NaN) are placed first on the x axis
    ordered = subset.sort_values(by=pollution_axes, ascending=True, na_position='first')

    fig = px.line(
        ordered,
        x="pollution",
        y="f1_micro",
        color='model',
        title=ordered['dataset'].iloc[0],
    )
    fig.update_xaxes(showticklabels=False)
    fig.show()

In [53]:
# Labels of the three coefficient columns, and of every column in the frame
pollution_axes = ['completeness', 'accuracy', 'dupp_fac']
pol_col = df[pollution_axes].columns
df_col = df.columns

# Keep rows whose 'completeness' and 'dupp_fac' coefficients are both missing,
# plus rows where all three coefficients are missing.
others_missing = df[pol_col.difference(['accuracy'])].isna().all(axis='columns')
all_missing = df[pol_col].isna().all(axis='columns')
groups = df[others_missing | all_missing].groupby('dataset')

# One line chart per dataset, one trace per model
for dataset, subset in groups:
    # Rows without coefficients (NaN) are placed first on the x axis
    ordered = subset.sort_values(by=pollution_axes, ascending=True, na_position='first')

    fig = px.line(
        ordered,
        x="pollution",
        y="f1_micro",
        color='model',
        title=ordered['dataset'].iloc[0],
    )
    fig.update_xaxes(showticklabels=False)
    fig.show()

In [55]:
# Labels of the three coefficient columns, and of every column in the frame
pollution_axes = ['completeness', 'accuracy', 'dupp_fac']
pol_col = df[pollution_axes].columns
df_col = df.columns

# Keep rows whose 'completeness' and 'accuracy' coefficients are both missing,
# plus rows where all three coefficients are missing.
others_missing = df[pol_col.difference(['dupp_fac'])].isna().all(axis='columns')
all_missing = df[pol_col].isna().all(axis='columns')
groups = df[others_missing | all_missing].groupby('dataset')

# One line chart per dataset, one trace per model
for dataset, subset in groups:
    # Rows without coefficients (NaN) are placed first on the x axis
    ordered = subset.sort_values(by=pollution_axes, ascending=True, na_position='first')

    fig = px.line(
        ordered,
        x="pollution",
        y="f1_micro",
        color='model',
        title=ordered['dataset'].iloc[0],
    )
    fig.update_xaxes(showticklabels=False)
    fig.show()

In [56]:
# Removed a stray reference to the undefined name `kkt`; it raised a NameError
# and made "Restart & Run All" fail at this cell.