# Importing libraries

In [104]:
import pandas as pd
import numpy as np
import json
import striprtf
import sklearn
from striprtf.striprtf import rtf_to_text
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression,SGDRegressor,Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import warnings
warnings.filterwarnings('ignore')

# Parsing RTF to JSON

In [105]:
# Read the raw RTF export of the UI parameters, then strip the RTF markup with
# striprtf's rtf_to_text so that only the plain-text payload remains.
with open("algoparams_from_ui.json.rtf", "r") as f1:
    rtf_str = f1.read()
text = rtf_to_text(rtf_str)

In [106]:
# Save the converted text to an intermediate file (j_file.txt); the following
# cell re-reads this file and parses it as JSON.
with open("j_file.txt", "w") as f2:
    f2.write(text)

In [107]:
# Parse the intermediate text file as JSON; json_data holds the decoded object.
with open("j_file.txt", "r") as f3:
    json_data = json.loads(f3.read())

In [108]:
# Write the parsed configuration back to disk as pretty-printed JSON.
with open("j_file.json", "w") as f4:
    f4.write(json.dumps(json_data, indent=4))

# Reading target and type of regression to be run

In [109]:
# Load the configuration from j_file.json into json_data.
with open("j_file.json", "r") as f1:
    json_data = json.loads(f1.read())

In [110]:
# Pull out the configuration sections used by the rest of the notebook.
design_state = json_data["design_state_data"]
# Target configuration block
target = design_state["target"]
# Name of the column to predict
target_column = target["target"]
# Prediction type (compared against "Regression" when models are trained)
pred_type = target["prediction_type"]
# "type" entry of the target configuration
mod_type = target["type"]
# Per-feature handling configuration
feature_handling = design_state["feature_handling"]
# Load the dataset and preview its first rows
df = pd.read_csv("iris.csv")
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Feature Handling, Application of Imputation on Iris data

In [111]:
# Apply the per-feature handling rules from the configuration to df.
#   * numerical features: impute missing values, keep them as-is, or drop the column
#   * text features: integer-encode the values with pd.factorize
# Features that are not selected, missing from df, or of another type are
# reported and left untouched.
for feature, config in feature_handling.items():
    if not config["is_selected"]:
        print(f"Warning: feature {feature} is not selected. Feature handling is skipped")
        continue
    if feature not in df.columns:
        # The configuration may list columns the loaded data does not contain.
        print(f"Warning: feature {feature} not found in data. Feature handling is skipped")
        continue

    variable_type = config["feature_variable_type"]
    feature_details = config["feature_details"]

    if variable_type == "numerical":
        missing_values_action = feature_details["missing_values"]
        if missing_values_action == "Impute":
            impute_with = feature_details["impute_with"]
            # The statistics must be *called*: passing the bound method itself
            # to fillna would store method objects in place of the NaNs.
            if impute_with == "Average of values":
                df[feature] = df[feature].fillna(df[feature].mean())
            elif impute_with == "Median":
                df[feature] = df[feature].fillna(df[feature].median())
            elif impute_with == "Most frequent values":
                # mode() returns a Series (there can be ties); use its first entry.
                modes = df[feature].mode()
                if not modes.empty:
                    df[feature] = df[feature].fillna(modes.iloc[0])
            elif impute_with == "custom":
                df[feature] = df[feature].fillna(feature_details["impute_value"])
            else:
                print(f"Warning: {impute_with} is not supported by feature {feature}. Imputation is skipped")
        elif missing_values_action == "keep as is":
            pass
        elif missing_values_action == "Drop feature":
            df = df.drop(columns=[feature], errors="ignore")
        else:
            print(f"Warning: {missing_values_action} not supported for feature {feature}. Handling missing value is skipped")

    elif variable_type == "text":
        text_handling = feature_details["text_handling"]
        if text_handling == "Tokenize and hash":
            # factorize returns (codes, uniques); only the integer codes are kept.
            df[feature] = pd.factorize(df[feature])[0]
        else:
            print("Encoding cannot be done")
    else:
        print(f"Warning: variable type {variable_type} not supported for feature {feature}. Feature handling is skipped")

# Feature Reduction

In [112]:
# Feature-reduction section of the configuration (method name and its parameters).
feature_reduction=json_data["design_state_data"]["feature_reduction"]

In [113]:
def corr_with_target_reduction(df,target_column,final_num_feature):
    """Keep the features most strongly correlated with the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    target_column : str
        Name of the target column.
    final_num_feature : int
        Number of feature columns to keep (the target is not counted).

    Returns
    -------
    pandas.DataFrame
        The target column followed by the selected features, ordered by
        decreasing absolute correlation with the target. ``df`` is returned
        unchanged when the target column is missing or not numeric.
    """
    # Correlation is only defined for numeric columns; recent pandas versions
    # raise if non-numeric columns are passed to corr().
    numeric_df = df.select_dtypes(include="number")
    if target_column not in numeric_df.columns:
        print("Target column not found or not numeric skipping correlation-based reduction")
        return df
    # Rank the features only. The target is re-attached explicitly below so it
    # can never be dropped by the ranking (e.g. when correlations are NaN).
    corrs = (
        numeric_df.corr()[target_column]
        .drop(labels=target_column)
        .abs()
        .sort_values(ascending=False)
    )
    keep = max(int(final_num_feature), 0)
    top_features = corrs.head(keep).index.tolist()
    return df[[target_column] + top_features]
def tree_based_reduction(df,target_column,final_num_feature,num_of_trees,depth_of_trees):
    """Keep the features ranked most important by a random forest.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    target_column : str
        Name of the target column.
    final_num_feature : int
        Number of feature columns to keep.
    num_of_trees : int
        ``n_estimators`` of the RandomForestRegressor used for ranking.
    depth_of_trees : int
        ``max_depth`` of the RandomForestRegressor used for ranking.

    Returns
    -------
    pandas.DataFrame
        The target column followed by the selected features (in their original
        column order). ``df`` is returned unchanged when ``final_num_feature``
        is not smaller than the number of columns or the target is missing.
    """
    if final_num_feature >= df.shape[1]:
        return df
    if target_column not in df.columns:
        print("Target column not found skipping tree-based reduction")
        return df

    x = df.drop(columns=[target_column])
    y = df[target_column]
    model = RandomForestRegressor(n_estimators=num_of_trees, max_depth=depth_of_trees, random_state=0)
    # SelectFromModel fits (a clone of) the estimator itself, so no separate
    # model.fit call is needed. threshold=-inf makes the selection depend on
    # max_features alone; with the default threshold, features below the mean
    # importance are discarded and fewer than requested may be kept.
    sel = SelectFromModel(model, threshold=-np.inf, max_features=final_num_feature)
    sel.fit(x, y)
    top_features = x.columns[sel.get_support()]
    return df[[target_column] + list(top_features)]
def PCA_reduction(df,target_column,final_num_feature):
    """Project the feature columns onto ``final_num_feature`` principal components.

    Returns a frame with columns ``PC1``, ``PC2``, ... followed by the target
    column. ``df`` is returned unchanged when ``final_num_feature`` is not
    smaller than ``df.shape[1] - 1`` or when a KeyError is raised (a message is
    printed in the latter case).
    """
    feature_count = df.shape[1] - 1
    if not (final_num_feature < feature_count):
        return df

    try:
        features = df.drop(columns=[target_column], errors="ignore")
        target_series = df[target_column]
        projected = PCA(n_components=final_num_feature).fit_transform(features)
        component_names = [f"PC{number}" for number in range(1, projected.shape[1] + 1)]
        components_df = pd.DataFrame(projected, columns=component_names, index=df.index)
        return pd.concat([components_df, target_series], axis=1)
    except KeyError:
        print("Target Column not found skipping PCA reduction")
        return df

# Select the configured feature-reduction strategy and apply it to df.
reduction_method = feature_reduction["feature_reduction_method"]
# One is subtracted from the configured count before it is handed to the helpers.
final_num_feature = int(feature_reduction["num_of_features_to_keep"]) - 1

if reduction_method == "PCA":
    df = PCA_reduction(df.copy(), target_column, final_num_feature)
elif reduction_method == "Tree-based":
    num_of_trees = int(feature_reduction["num_of_trees"])
    depth_of_trees = int(feature_reduction["depth_of_trees"])
    df = tree_based_reduction(
        df.copy(), target_column, final_num_feature, num_of_trees, depth_of_trees
    )
elif reduction_method == "Corr with Target":
    df = corr_with_target_reduction(df.copy(), target_column, final_num_feature)
else:
    print(f"Warning: Feature reduction method '{reduction_method}' not supported. Skipping feature reduction.")

# Model building and evaluation

In [114]:
# Per-algorithm settings; every (name, settings) pair is passed to model_training_eval.
algorithms_config=json_data["design_state_data"]["algorithms"]


In [115]:

def _inclusive_int_range(low, high):
    """Return the integers low..high (both ends included) as a plain list."""
    return list(range(int(low), int(high) + 1))


def _inclusive_float_range(low, high, step):
    """Return low, low+step, ... up to and including high as a list of floats."""
    # Half a step of slack keeps `high` in the range without overshooting it;
    # rounding removes float noise such as 0.30000000000000004.
    return [round(float(value), 10) for value in np.arange(low, high + step / 2, step)]


def _as_list(value):
    """Return value as a list, wrapping scalars (GridSearchCV needs sequences)."""
    if isinstance(value, (list, tuple, np.ndarray)):
        return list(value)
    return [value]


def _build_regressor(algo_config, algo_name):
    """Return (estimator, param_grid) for a supported regressor, else None."""
    mod_name = algo_config["model_name"]

    if mod_name == "LinearRegression" and algo_name == "LinearRegression":
        # LinearRegression has no iteration count to tune, so SGDRegressor is used.
        # penalty="elasticnet" is required for l1_ratio to have any effect.
        model = SGDRegressor(penalty="elasticnet", random_state=42)
        param_grid = {
            "max_iter": _inclusive_int_range(algo_config["min_iter"], algo_config["max_iter"]),
            # regparam -> alpha, elasticnet -> l1_ratio
            "alpha": _inclusive_float_range(algo_config["min_regparam"], algo_config["max_regparam"], 0.1),
            "l1_ratio": _inclusive_float_range(algo_config["min_elasticnet"], algo_config["max_elasticnet"], 0.1),
        }
    elif mod_name == "Random Forest Regressor" and algo_name == "RandomForestRegressor":
        model = RandomForestRegressor(random_state=42)
        # number of trees -> n_estimators, samples per leaf -> min_samples_leaf
        param_grid = {
            "n_estimators": _inclusive_int_range(algo_config["min_trees"], algo_config["max_trees"]),
            "max_depth": _inclusive_int_range(algo_config["min_depth"], algo_config["max_depth"]),
            "min_samples_leaf": _inclusive_int_range(
                algo_config["min_samples_per_leaf_min_value"],
                algo_config["min_samples_per_leaf_max_value"],
            ),
        }
    elif mod_name == "Gradient Boosted Trees" and algo_name == "GBTRegressor":
        model = GradientBoostingRegressor(random_state=42)
        # boosting stages -> n_estimators, step size -> learning_rate.
        # The feature-sampling and loss options of the configuration are not mapped.
        param_grid = {
            "n_estimators": _as_list(algo_config["num_of_BoostingStages"]),
            "learning_rate": _inclusive_float_range(algo_config["min_stepsize"], algo_config["max_stepsize"], 0.01),
            "max_depth": _inclusive_int_range(algo_config["min_depth"], algo_config["max_depth"]),
        }
    elif mod_name == "RidgeRegression" and algo_name == "RidgeRegression":
        model = Ridge()
        param_grid = {
            "alpha": _inclusive_float_range(algo_config["min_regparam"], algo_config["max_regparam"], 0.1),
            "max_iter": _inclusive_int_range(algo_config["min_iter"], algo_config["max_iter"]),
        }
    elif mod_name == "Lasso Regression" and algo_name == "LassoRegression":
        # random_state makes the "random" coordinate selection reproducible.
        model = Lasso(random_state=42)
        param_grid = {
            "max_iter": _inclusive_int_range(algo_config["min_iter"], algo_config["max_iter"]),
            "alpha": _inclusive_float_range(algo_config["min_regparam"], algo_config["max_regparam"], 0.1),
            "selection": ["cyclic", "random"],
        }
    elif algo_name == "ElasticNetRegression":
        model = ElasticNet()
        param_grid = {
            "alpha": _inclusive_float_range(algo_config["min_regparam"], algo_config["max_regparam"], 0.1),
            "l1_ratio": _inclusive_float_range(algo_config["min_elasticnet"], algo_config["max_elasticnet"], 0.1),
            "max_iter": _inclusive_int_range(algo_config["min_iter"], algo_config["max_iter"]),
        }
    elif mod_name == "Decision Tree" and algo_name == "DecisionTreeRegressor":
        # DecisionTreeRegressor has no gini/entropy criterion; regression criteria are used.
        model = DecisionTreeRegressor(random_state=42)
        splitter = []
        if algo_config["use_best"]:
            splitter.append("best")
        if algo_config["use_random"]:
            splitter.append("random")
        if not splitter:
            # An empty candidate list is rejected by GridSearchCV; fall back to
            # the estimator's default splitter.
            splitter = ["best"]
        param_grid = {
            "max_depth": _inclusive_int_range(algo_config["min_depth"], algo_config["max_depth"]),
            "min_samples_leaf": _as_list(algo_config["min_samples_per_leaf"]),
            "criterion": ["squared_error", "friedman_mse", "absolute_error"],
            "splitter": splitter,
        }
    else:
        return None

    return model, param_grid


def model_training_eval(df,algo_config,prediction_type,target_column,algo_name):
    """Grid-search one configured regressor and report its hold-out metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    algo_config : dict
        Settings of one algorithm; must contain ``is_selected`` and
        ``model_name`` plus the range keys read for the chosen model.
    prediction_type : str
        Only ``"Regression"`` is supported.
    target_column : str
        Name of the target column in ``df``.
    algo_name : str
        Key of the algorithm in the configuration (e.g. ``"RandomForestRegressor"``).

    Returns
    -------
    dict or None
        ``None`` when the algorithm is not selected or not supported; otherwise
        a dict with ``model_name``, ``best_params``, ``mse``, ``r2``, ``mae``
        and ``best_estimator``.

    Notes
    -----
    The number of CV folds and the parallelism are read from the notebook-level
    ``json_data["design_state_data"]["hyperparameters"]``.
    """
    if not algo_config['is_selected']:
        print(f"{algo_config['model_name']} is not selected. Skipping.")
        return None

    mod_name = algo_config["model_name"]
    print(f"{mod_name}")

    # Use the argument, not a notebook-level global, to decide what to run.
    if prediction_type != "Regression":
        print(f"Warning: Prediction type '{prediction_type}' not supported. Skipping.")
        return None

    built = _build_regressor(algo_config, algo_name)
    if built is None:
        print(f"Warning: Algorithm '{mod_name}' not supported for regression. Skipping.")
        return None
    model, param_grid = built

    # Split features/target and make sure every numeric column is float.
    x = df.drop(columns=target_column, errors="ignore")
    y = df[target_column].astype(float)
    numeric_cols = x.select_dtypes(include=np.number).columns
    x[numeric_cols] = x[numeric_cols].astype(float)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=35)

    # Scale inside the pipeline so the scaler is re-fitted on every CV training
    # fold; grid keys are prefixed with the pipeline step name.
    step_prefix = "model__"
    pipeline = Pipeline([("scaler", StandardScaler()),
                         ("model", model)])
    pipeline_grid = {step_prefix + name: values for name, values in param_grid.items()}

    hyperparameters = json_data["design_state_data"]["hyperparameters"]
    num_of_folds = hyperparameters["num_of_folds"]
    # parallelism == 0 means "use all cores"; n_jobs=0 itself is not valid.
    parallelism = hyperparameters["parallelism"]
    n_jobs = int(parallelism) if parallelism else -1

    grid_search_cv = GridSearchCV(pipeline, param_grid=pipeline_grid, cv=num_of_folds,
                                  scoring="neg_mean_squared_error", verbose=3, n_jobs=n_jobs)
    grid_search_cv.fit(x_train, y_train)
    best_est = grid_search_cv.best_estimator_
    # Report the parameters under the estimator's own names (without step prefix).
    best_params = {name[len(step_prefix):]: value
                   for name, value in grid_search_cv.best_params_.items()}

    y_pred = best_est.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    r2s = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Best parameters:{best_params}")
    print(f"mse = {mse}")
    print(f"r2s = {r2s}")
    print(f"mae = {mae}")
    return {
        "model_name": mod_name,
        "best_params": best_params,
        "mse": mse,
        "r2": r2s,
        "mae": mae,
        "best_estimator": best_est,
    }
# Train and evaluate every configured algorithm. Each call receives its own
# copy of the data together with the algorithm's key and settings.
for algo_name, algo_config in algorithms_config.items():
    model_training_eval(
        df=df.copy(),
        algo_config=algo_config,
        prediction_type=pred_type,
        target_column=target_column,
        algo_name=algo_name,
    )


Random Forest Classifier is not selected. Skipping.
Random Forest Regressor
Fitting 6 folds for each of 396 candidates, totalling 2376 fits
Best parameters:{'max_depth': 20, 'min_samples_leaf': 8, 'n_estimators': 18}
mse = 0.04737531527594438
r2s = 0.9249064503390991
mae = 0.16763632807942688
Gradient Boosted Trees is not selected. Skipping.
Gradient Boosted Trees is not selected. Skipping.
LinearRegression is not selected. Skipping.
LogisticRegression is not selected. Skipping.
RidgeRegression is not selected. Skipping.
Lasso Regression is not selected. Skipping.
Lasso Regression is not selected. Skipping.
XG Boost is not selected. Skipping.
Decision Tree is not selected. Skipping.
Decision Tree is not selected. Skipping.
Support Vector Machine is not selected. Skipping.
Stochastic Gradient Descent is not selected. Skipping.
KNN is not selected. Skipping.
Extra Random Trees is not selected. Skipping.
Neural Network is not selected. Skipping.
