In [None]:
# Author - Priti Gupta
# Submitted - 1st August 2023

In [1]:
import json
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


# Function to parse JSON and create pipelines
def _make_estimator(algo_name):
    """Return a new, default-configured classifier for *algo_name*.

    Raises:
        ValueError: if *algo_name* is not one of the supported algorithm keys.
    """
    estimator_classes = {
        'RandomForestClassifier': RandomForestClassifier,
        'GradientBoostedTrees': GradientBoostingClassifier,
        'LogisticRegression': LogisticRegression,
        'XGBoost': XGBClassifier,
        'DecisionTreeClassifier': DecisionTreeClassifier,
        'SVM': SVC,
        'KNN': KNeighborsClassifier,
        'ExtraRandomTrees': ExtraTreesClassifier,
        'NeuralNetwork': MLPClassifier,
    }
    try:
        estimator_cls = estimator_classes[algo_name]
    except KeyError:
        supported = ', '.join(sorted(estimator_classes))
        raise ValueError(
            f"Unsupported algorithm {algo_name!r}; expected one of: {supported}"
        ) from None
    return estimator_cls()


def parse_json_and_create_pipeline(json_data):
    """Build one PCA + classifier pipeline per algorithm selected in the config.

    Args:
        json_data: Parsed configuration. Reads
            ``design_state_data.feature_reduction.num_of_features_to_keep``
            (converted with ``int``) and ``design_state_data.algorithms``, a
            mapping of algorithm name to a dict carrying an ``is_selected``
            flag.

    Returns:
        A list of unfitted ``Pipeline`` objects with the steps
        ``'feature_reduction'`` (PCA) and ``'model'``, in the order the
        selected algorithms appear in the config. Empty when nothing is
        selected.

    Raises:
        ValueError: if a selected algorithm name is not supported. Previously
            such a name either crashed on an unbound variable or silently
            reused the model built for the preceding algorithm.
    """
    design_state = json_data['design_state_data']
    n_components = int(
        design_state['feature_reduction']['num_of_features_to_keep']
    )

    pipelines = []
    for algo_name, algo_data in design_state['algorithms'].items():
        if not algo_data['is_selected']:
            continue
        pipelines.append(Pipeline([
            # A fresh PCA per pipeline: sharing one instance would let fitting
            # one pipeline overwrite the reducer state used by the others.
            ('feature_reduction', PCA(n_components=n_components)),
            ('model', _make_estimator(algo_name)),
        ]))
    return pipelines


# Function to check model accuracy on the given (held-out) data
def check_model_accuracy(pipeline, X, y):
    """Predict on X with an already-fitted estimator and score against y.

    Prints the predicted labels and the expected labels, then returns the
    value computed by ``accuracy_score(y, predictions)``.

    Args:
        pipeline: Fitted object exposing ``predict``.
        X: Feature rows to predict on.
        y: True labels for X.

    Returns:
        The accuracy of the predictions on the supplied data.
    """
    predictions = pipeline.predict(X)
    score = accuracy_score(y, predictions)
    # One print per labelled section; sep='\n' keeps label and values on
    # separate lines.
    print('y_pred:', predictions, sep='\n')
    print('y_test:', y, sep='\n')
    return score

if __name__ == "__main__":
    # Load the configuration that drives the whole run.
    with open("algoparmas_from_ui.json") as json_file:
        json_data = json.load(json_file)
    design_state = json_data["design_state_data"]

    # Load dataset
    dataset_filename = design_state["session_info"]["dataset"]
    dataset = pd.read_csv(dataset_filename)

    # Split features and target variable. X stays a DataFrame so that the
    # rescaling step below can address columns by name.
    target_var = 'species'
    X = dataset.drop(columns=[target_var])
    y = dataset[target_var].values

    # Feature handling: classify the configured features by declared type.
    feature_handling = design_state['feature_handling']
    numerical_features = [
        feat_name
        for feat_name, feat_data in feature_handling.items()
        if feat_data['feature_variable_type'] == 'numerical'
    ]
    # Only columns actually present in X can be rescaled; the config may also
    # describe the target or columns that are absent from the CSV.
    numerical_features_to_scale = [
        feat_name
        for feat_name in numerical_features
        if feature_handling[feat_name]['feature_details']['rescaling'] == 'Rescaling'
        and feat_name in X.columns
    ]
    text_features = [
        feat_name
        for feat_name, feat_data in feature_handling.items()
        if feat_data['feature_variable_type'] == 'text'
    ]

    if target_var in text_features:
        # Label encode the target variable y
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(y)

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Create pipelines based on JSON data
    pipelines = parse_json_and_create_pipeline(json_data)

    # Cross-validation settings are the same for every model.
    hyperparameters = design_state["hyperparameters"]
    parallelism = hyperparameters.get("parallelism", None)
    num_of_folds = hyperparameters.get("num_of_folds", None)

    # Candidate values; each model only receives the entries it supports.
    param_grid = {
        'model__n_estimators': [50, 100, 150],
        'model__max_depth': [None, 5, 10],
        'model__min_samples_split': [2, 5],
    }

    # Fit each pipeline using GridSearchCV (hyperparameter tuning) and report
    # accuracy on the held-out test split.
    for pipeline in pipelines:
        print(f"Model -  {pipeline['model']}")

        estimator = pipeline
        if numerical_features_to_scale:
            # Rescale inside the pipeline so the scaler is fitted on training
            # folds only (no leakage into validation folds or the test split).
            rescaler = ColumnTransformer(
                [('scale', StandardScaler(), numerical_features_to_scale)],
                remainder='passthrough',
            )
            estimator = Pipeline([('rescaling', rescaler), *pipeline.steps])

        # Drop grid entries the current model does not expose; GridSearchCV
        # rejects unknown parameter names (e.g. n_estimators on SVC).
        supported_params = estimator.get_params()
        model_param_grid = {
            name: values
            for name, values in param_grid.items()
            if name in supported_params
        }

        grid_search = GridSearchCV(
            estimator,
            model_param_grid,
            cv=num_of_folds,
            scoring='accuracy',
            n_jobs=parallelism,
        )
        grid_search.fit(X_train, y_train)

        accuracy = check_model_accuracy(grid_search, X_test, y_test)
        print(f"Accuracy: {accuracy}")
        print("--------------------------")

Model -  RandomForestClassifier()
y_pred:
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
y_test:
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Accuracy: 1.0
--------------------------
