In [1]:
import json
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import HashingVectorizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Helper functions

In [2]:
def tokenize_and_hash(df, column_name):
    #tokenize the text column
    tokens = df[column_name].apply(word_tokenize)
    #hash the tokens
    hash_vectorizer = HashingVectorizer(n_features=5)       #Default number of columns for tokenization is 5
    hashed_features = hash_vectorizer.transform(tokens.apply(' '.join))
    hashed_df = pd.DataFrame(hashed_features.toarray(), columns=[f'{column_name}_feature_{i+1}' for i in range(5)])
    #replace the text columns with newly created hashs
    df = df.drop(columns = [column_name])
    return pd.concat([df, hashed_df], axis=1)

In [3]:
def transform_df(df, features):
    for i in features:
        if features[i]["is_selected"]:
            if features[i]["feature_variable_type"]=="numerical":
                df[i] = pd.to_numeric(df[i])
                if features[i]["feature_details"]["impute_with"]=="Average of values":
                    mean  = df[i].mean()
                    df[i] = df[i].fillna(mean)
                else:
                    df[i] = df[i].fillna(int(features[i]["feature_details"]["impute_value"]))
           
            if features[i]["feature_variable_type"]=="text":
                df = tokenize_and_hash(df, i)
        else:
            df = df.drop(columns=[i])   
    return df

In [4]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 

def reduce(target, df, reduction_dictionary):

    if reduction_dictionary["feature_reduction_method"] == "Correlation with target":
        num_of_features_to_keep = reduction_dictionary["num_of_features_to_keep"]
        correlations = df.corr()[target]
        sorted_features = correlations.abs().sort_values(ascending=False)
        top_features = sorted_features.head(int(num_of_features_to_keep)+1).index
        new_df = df[top_features]
        new_df[target] = df[target]
        return new_df

    elif reduction_dictionary["feature_reduction_method"] == "Tree-based":
        num_of_features_to_keep = reduction_dictionary["num_of_features_to_keep"]
        depth_of_trees = reduction_dictionary["depth_of_trees"]
        num_of_trees = reduction_dictionary["num_of_trees"]
        model = RandomForestRegressor(n_estimators = int(num_of_trees), max_depth = int(depth_of_trees))
        model.fit(df.drop(columns = [target]).values, df[target].values)
        feature_importance = model.feature_importances_
        sorted_indices = feature_importance.argsort()[::-1]
        sorted_features = df.drop(columns = [target]).columns[sorted_indices]
        selected_features = sorted_features[:int(num_of_features_to_keep)]
        new_df = df[selected_features]
        new_df[target] = df[target]
        return new_df
    
    elif reduction_dictionary["feature_reduction_method"] == "Principal Component Analysis":
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(df.drop(columns = [target]).values)
        pca = PCA(n_components=int(reduction_dictionary["num_of_features_to_keep"]))
        pca_result = pca.fit_transform(scaled_data)
        components = pca.components_
        new_column_names = [f'PC{i+1}' for i in range(int(reduction_dictionary["num_of_features_to_keep"]))]
        pca_df = pd.DataFrame(data=pca_result, columns=new_column_names)
        pca_df[target] = df[target]
        return pca_df

    elif reduction_dictionary["feature_reduction_method"] == "No Reduction":
        return df
        

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor

def build_models(models, data):

    model_pipelines = {}
    for model in models:
        model_param = data['design_state_data']['algorithms'][model]
        
        if model == "RandomForestRegressor":
            model = RandomForestRegressor()
            model_pipelines["RandomForestRegressor"] = model
    
    
        elif model == "LinearRegression":
            model = LinearRegression()
            model_pipelines["LinearRegression"] = model
        
    
        elif model == "RidgeRegression":
            model = Ridge(
                    alpha = float(model_param["regularization_term"]))
            model_pipelines["RidgeRegression"] = model
    
        elif model == "LassoRegression":
            model = Lasso(
                    max_iter = int(model_param["max_iter"]),
                    alpha = float(model_param["regularization_term"]))
            model_pipelines["LassoRegression"] = model
    
        elif model == "ElasticNetRegression":
            model = ElasticNet(
                    max_iter = int(model_param["max_iter"]),
                    alpha = float(model_param["regularization_term"]))
            model_pipelines["ElasticNetRegression"] = model
    
        elif model == "DecisionTreeRegressor":
            model = DecisionTreeRegressor(
                    max_depth = int(model_param["max_depth"]))
            model_pipelines["DecisionTreeRegressor"] = model

    return model_pipelines


In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

def grid_search_cv_pipeline(pipeline_obj,data):
    for model in pipeline_obj:
        model_param = data['design_state_data']['algorithms'][model]
        if model == 'RandomForestRegressor':
            reg = make_pipeline(StandardScaler(),GridSearchCV(
                        estimator = pipeline_obj[model],
                        param_grid = {'n_estimators'     : [int(model_param["min_trees"])                      ,  int(model_param["max_trees"])],
                                      'max_depth'        : [int(model_param["min_depth"])                      ,  int(model_param["max_depth"])],
                                      'min_samples_leaf' : [int(model_param["min_samples_per_leaf_min_value"]) ,  int(model_param["min_samples_per_leaf_max_value"])]
                                     }
            ))
            pipeline_obj[model] = reg

        #TEST LATER
        elif model == "LinearRegression":
            reg = make_pipeline(StandardScaler(),GridSearchCV(
                        estimator = pipeline_obj[model],
                        param_grid = {}
            ))
            pipeline_obj[model] = reg        
    
        elif model == "RidgeRegression":
            reg = make_pipeline(StandardScaler(),GridSearchCV(
                        estimator = pipeline_obj[model],
                        param_grid = {'max_iter' : [int(model_param["min_iter"])       ,   int(model_param["max_iter"])],
                                      'alpha'    : [float(model_param["min_regparam"]) ,   float(model_param["max_regparam"])]}
            ))
            pipeline_obj[model] = reg 
    
        elif model == "LassoRegression":
            reg = make_pipeline(StandardScaler(),GridSearchCV(
                        estimator = pipeline_obj[model],
                        param_grid = {'max_iter' : [int(model_param["min_iter"])       ,   int(model_param["max_iter"])],
                                      'alpha'    : [float(model_param["min_regparam"]) ,   float(model_param["max_regparam"])]}
            ))
            pipeline_obj[model] = reg 
    
        elif model == "ElasticNetRegression":
            reg = make_pipeline(StandardScaler(),GridSearchCV(
                        estimator = pipeline_obj[model],
                        param_grid = {'max_iter' : [int(model_param["min_iter"])        ,   int(model_param["max_iter"])],
                                      'alpha'    : [float(model_param["min_regparam"])  ,   float(model_param["max_regparam"])],
                                      'l1_ratio' : [float(model_param["min_elasticnet"]),   float(model_param["min_elasticnet"])]}
            ))
            pipeline_obj[model] = reg 
    
        elif model == "DecisionTreeRegressor":
            reg = make_pipeline(StandardScaler(),GridSearchCV(
                        estimator = pipeline_obj[model],
                        param_grid = {'max_depth'        : [int(model_param["min_depth"]) , int(model_param["max_depth"])],
                                      'min_samples_leaf' : model_param['min_samples_per_leaf']}
            ))
            pipeline_obj[model] = reg 
    return pipeline_obj

## Read algoparams_from_ui.json

In [8]:
from sklearn.model_selection import train_test_split

with open("assets/algoparams_from_ui.json", 'r') as file:
    data = json.load(file)

## Read CSV

In [9]:
df = pd.read_csv("assets/iris.csv")
target = data['design_state_data']['target']['target']
prediction_type = data['design_state_data']['target']['prediction_type']
features = data['design_state_data']["feature_handling"]

## Untransformed Dataframe

In [10]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Transformed Dataframe

In [11]:
df = transform_df(df, features)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_feature_1,species_feature_2,species_feature_3,species_feature_4,species_feature_5
0,5.1,3.5,1.4,0.2,0.0,-1.0,0.0,0.0,0.0
1,4.9,3.0,1.4,0.2,0.0,-1.0,0.0,0.0,0.0
2,4.7,3.2,1.3,0.2,0.0,-1.0,0.0,0.0,0.0
3,4.6,3.1,1.5,0.2,0.0,-1.0,0.0,0.0,0.0
4,5.0,3.6,1.4,0.2,0.0,-1.0,0.0,0.0,0.0


## Dataframe after feature reduction 

In [12]:
df = reduce(target, df, data['design_state_data']['feature_reduction'])
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[target] = df[target]


Unnamed: 0,petal_length,species_feature_2,species_feature_3,sepal_width,petal_width
0,1.4,-1.0,0.0,3.5,0.2
1,1.4,-1.0,0.0,3.0,0.2
2,1.3,-1.0,0.0,3.2,0.2
3,1.5,-1.0,0.0,3.1,0.2
4,1.4,-1.0,0.0,3.6,0.2


## Build models

In [13]:
models = ["RandomForestRegressor", "GBTRegressor", "LinearRegression", "RidgeRegression", "LassoRegression", "ElasticNetRegression", "DecisionTreeRegressor"]
X = df.drop(columns = [target]).values
y = df[target].values
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=0)

model_pipelines = build_models(models,data)
pipelines = grid_search_cv_pipeline(model_pipelines, data)

## Predict

In [14]:
for i in pipelines:
    pipelines[i].fit(X_train, y_train)
    print(f"Accuracy of {i} : {pipelines[i].score(X_test, y_test)}")
    best_estimator = pipelines[i].named_steps['gridsearchcv'].best_estimator_
    print(f"Model metrics are : {best_estimator}")
    print("\n")

Accuracy of RandomForestRegressor : 0.9252343675951119
Model metrics are : RandomForestRegressor(max_depth=20, min_samples_leaf=5, n_estimators=20)


Accuracy of LinearRegression : 0.9341241341317029
Model metrics are : LinearRegression()


Accuracy of RidgeRegression : 0.9333368914639942
Model metrics are : Ridge(alpha=0.8, max_iter=30)


Accuracy of LassoRegression : 0.5103904656519349
Model metrics are : Lasso(alpha=0.5, max_iter=30)


Accuracy of ElasticNetRegression : 0.7542938487640625
Model metrics are : ElasticNet(alpha=0.5, max_iter=30)


Accuracy of DecisionTreeRegressor : 0.9083395073623918
Model metrics are : DecisionTreeRegressor(max_depth=4, min_samples_leaf=6)


