In [1]:
# ! pip install striprtf -q

In [1]:
import pandas as pd
import json
from striprtf.striprtf import rtf_to_text

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score


In [2]:
# Path to the RTF-wrapped JSON configuration that drives the rest of the notebook.
# It is a relative path, so it is resolved against the kernel's working directory.
file_path = "algoparams_from_ui1.json.rtf"

In [3]:
def rtf_parser(rtf_file_path, encoding='utf-8'):
    """Read an RTF file and return its contents as plain text.

    Args:
        rtf_file_path: Path of the RTF file to read.
        encoding: Text encoding used to open the file (default ``'utf-8'``).

    Returns:
        The string produced by ``striprtf``'s ``rtf_to_text`` for the file's
        full contents.
    """
    from striprtf.striprtf import rtf_to_text

    # Slurp the raw RTF markup first, then convert it once the file is closed.
    with open(rtf_file_path, 'r', encoding=encoding) as handle:
        raw_rtf = handle.read()
    return rtf_to_text(raw_rtf)

In [4]:
def rtf_to_json_parser(rtf_file_path):
    """Parse an RTF file whose plain-text content is a JSON document.

    Args:
        rtf_file_path: Path of the RTF file to read.

    Returns:
        The Python object produced by ``json.loads`` on the text that
        ``rtf_parser`` extracts from the file.
    """
    import json

    return json.loads(rtf_parser(rtf_file_path))

In [15]:
def get_selected_features_and_details(json_data):
    """Return the selected input features and the details of every selected column.

    Args:
        json_data: Parsed configuration. Reads
            ``json_data["design_state_data"]["feature_handling"]`` (a mapping whose
            values carry ``"is_selected"`` and ``"feature_name"``) and
            ``json_data["design_state_data"]["target"]["target"]``.

    Returns:
        tuple: ``(selected_features, feature_details)`` where
            ``selected_features`` is the list of selected feature names with the
            target column excluded, and ``feature_details`` maps each selected
            name (including the target, if it is selected) to its details dict.
    """
    design_state = json_data["design_state_data"]
    # Look the target up from the configuration itself rather than relying on a
    # notebook-level global being present when this function is called.
    target_variable = design_state["target"]["target"]

    feature_details = {
        details["feature_name"]: details
        for details in design_state["feature_handling"].values()
        if details["is_selected"]
    }
    # The target must not appear among the model inputs. Filtering (instead of
    # list.remove) also copes with a target that is not marked as selected.
    selected_features = [name for name in feature_details if name != target_variable]
    return selected_features, feature_details

In [13]:
def get_split_dataset(json_data):
    """Load the configured dataset and split it into training and validation sets.

    Args:
        json_data: Parsed configuration. Reads, under ``"design_state_data"``:
            ``["session_info"]["dataset"]`` (passed to ``pd.read_csv``),
            ``["target"]["target"]`` (target column name), and
            ``["train"]["train_ratio"]`` / ``["train"]["random_seed"]``.

    Returns:
        tuple: ``(X_train, X_val, Y_train, Y_val)`` as produced by
            ``train_test_split`` with ``train_size=train_ratio`` and
            ``random_state=random_seed``.
    """
    design_state = json_data["design_state_data"]
    dataset = design_state["session_info"]["dataset"]
    target_variable = design_state["target"]["target"]

    train_info = design_state["train"]
    train_ratio = train_info["train_ratio"]
    random_seed = train_info["random_seed"]

    selected_features, _ = get_selected_features_and_details(json_data)

    df = pd.read_csv(dataset)
    X = df[selected_features]
    Y = df[target_variable]

    # Perform the split and hand it back; without this the function would
    # compute X and Y and then silently return None.
    return train_test_split(X, Y, train_size=train_ratio, random_state=random_seed)

In [16]:
def get_problem_type_and_target_variable(json_data):
    """Return the prediction type and target column name from the configuration.

    Args:
        json_data: Parsed configuration containing
            ``["design_state_data"]["target"]`` with the keys
            ``"prediction_type"`` and ``"target"``.

    Returns:
        tuple: ``(problem_type, target_variable)``.
    """
    target_section = json_data["design_state_data"]["target"]
    return target_section["prediction_type"], target_section["target"]

In [5]:
# Parse the RTF-wrapped JSON configuration into a Python object; every later cell reads from it.
json_data = rtf_to_json_parser(file_path)

In [6]:
# Pull the pieces of the configuration that later cells read by name.
design_state = json_data["design_state_data"]
# Value handed to pd.read_csv below (a path or URL to a CSV file).
dataset = design_state["session_info"]["dataset"]
# Compared against 'Classification' in the evaluation cell to choose scoring and metrics.
problem_type  = design_state["target"]["prediction_type"]

# Load the full dataset; feature/target selection happens in later cells.
df = pd.read_csv(dataset)

In [7]:
# Collect the names of the selected features and their preprocessing details.
feature_handling = json_data["design_state_data"]["feature_handling"]
# Read the target name from the configuration here so this cell does not depend
# on a variable left over from an earlier kernel session.
target_variable = json_data["design_state_data"]["target"]["target"]

selected_features = []
feature_details = {}
for details in feature_handling.values():
    if details["is_selected"]:
        name = details["feature_name"]
        # Details are kept for every selected column, including the target.
        feature_details[name] = details
        # The target itself must not be used as a model input.
        if name != target_variable:
            selected_features.append(name)

In [10]:
# Define every input of the split in this cell so it runs on a fresh kernel
# instead of relying on X, Y, train_ratio and random_seed from earlier sessions.
train_info = design_state["train"]
train_ratio = train_info["train_ratio"]
random_seed = train_info["random_seed"]  # also used when instantiating the models below

target_variable = design_state["target"]["target"]
# Guard against the target leaking into the inputs.
X = df[[name for name in selected_features if name != target_variable]]
Y = df[target_variable]

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=train_ratio, random_state=random_seed)

In [11]:
# Show the name and raw configuration of every algorithm flagged as selected.
algorithms = design_state["algorithms"]

for algo, details in algorithms.items():
    if not details["is_selected"]:
        continue
    # One item per line, followed by a separator line.
    print(algo, details, "------------>", sep="\n")

RandomForestClassifier
{'model_name': 'Random Forest Classifier', 'is_selected': True, 'min_trees': 10, 'max_trees': 30, 'feature_sampling_statergy': 'Default', 'min_depth': 20, 'max_depth': 30, 'min_samples_per_leaf_min_value': 5, 'min_samples_per_leaf_max_value': 50, 'parallelism': 0}
------------>
DecisionTreeClassifier
{'model_name': 'Decision Tree', 'is_selected': True, 'min_depth': 4, 'max_depth': 7, 'use_gini': False, 'use_entropy': True, 'min_samples_per_leaf': [12, 6], 'use_best': True, 'use_random': False}
------------>


In [12]:
# Lookup table from an algorithm identifier to a default-constructed estimator.
# Two of the keys ("RandomForestClassifier", "DecisionTreeClassifier") match the
# names printed from design_state["algorithms"] above; presumably the remaining
# keys mirror the other entries of that mapping — TODO confirm against the JSON.
# Requires `random_seed` to already be defined at notebook level.
model_selection = {
    "RandomForestClassifier" : RandomForestClassifier(random_state=random_seed),
    "RandomForestRegressor" : RandomForestRegressor(random_state=random_seed),
    "LinearRegression": LinearRegression(),
    "LogisticRegression": LogisticRegression(random_state=random_seed),
    "RidgeRegression": Ridge(),
    "LassoRegression": Lasso(),
    "ElasticNetRegression": ElasticNet(),
    # NOTE(review): "ds" is a plain string, not an estimator; selecting this key
    # and passing the value to GridSearchCV cannot work. Looks like a placeholder — confirm.
    "xg_boost": "ds",
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=random_seed),
    "DecisionTreeClassifier":DecisionTreeClassifier(random_state=random_seed),
    "SVM": SVC(random_state=random_seed),
    "KNN": KNeighborsClassifier(),
    "neural_network": MLPClassifier(random_state=random_seed)    
}

In [30]:
from sklearn.feature_extraction.text import HashingVectorizer
import numpy as np

# Scratch experiment: hash a small list of species labels with HashingVectorizer.
# Sample dataset: two distinct labels, three rows each.
species_data = ['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor']

# Create a HashingVectorizer with a very small hash space: n_features=3 yields
# the 3-column matrix printed below (this is not the library default size).
hashing_vectorizer = HashingVectorizer(n_features=3, alternate_sign=False)

# Convert the species data to hashed values
hashed_species = hashing_vectorizer.transform(species_data)

# Convert the sparse matrix to a dense matrix for easier handling (optional)
hashed_species_dense = hashed_species.toarray()

# Output the processed data.
# NOTE(review): in the saved output all six rows are identical, so with this
# configuration the two species cannot be told apart from the hashed values.
print("Processed (hashed) species data:\n", hashed_species_dense)


Processed (hashed) species data:
 [[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [29]:
# Display the dense form of the hashed species matrix.
# NOTE(review): the saved output of this cell has 2 columns, while the vectorizer
# configured in this notebook uses n_features=3 — the output looks stale; re-run to refresh.
hashed_species.toarray()

array([[0.70710678, 0.70710678],
       [0.70710678, 0.70710678],
       [0.70710678, 0.70710678],
       [0.70710678, 0.70710678],
       [0.70710678, 0.70710678],
       [0.70710678, 0.70710678]])

In [None]:
# Hyperparameter search and evaluation for a single model.
#
# NOTE(review): `model` and `hyperparameters` are not defined by any visible
# cell. Presumably `model` is one entry of `model_selection` and
# `hyperparameters` maps estimator parameter names to a value or a list of
# candidate values — confirm and define both before running this cell.

# Build the search grid: GridSearchCV expects a list of candidates per
# parameter, so scalar values are wrapped in a one-element list.
param_grid = {
    param: value if isinstance(value, list) else [value]
    for param, value in hyperparameters.items()
}

is_classification = problem_type == 'Classification'

# Step 6: Perform hyperparameter tuning using GridSearchCV
scoring = 'accuracy' if is_classification else 'neg_mean_squared_error'
grid_search = GridSearchCV(model, param_grid, cv=5, scoring=scoring)
grid_search.fit(X_train, Y_train)

# Get the best model and score it on the held-out validation split.
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_val)

# Step 7: Evaluate the model based on the type of problem
if is_classification:
    # Evaluate using confusion matrix and classification report
    conf_matrix = confusion_matrix(Y_val, Y_pred)
    class_report = classification_report(Y_val, Y_pred)
    print("Confusion Matrix:\n", conf_matrix)
    print("\nClassification Report:\n", class_report)
else:  # Regression
    # Evaluate using R-squared, adjusted R-squared, and RMSE
    mse = mean_squared_error(Y_val, Y_pred)
    # Plain exponentiation keeps this cell independent of a numpy import
    # that lives in an unrelated scratch cell.
    rmse = mse ** 0.5
    r_squared = r2_score(Y_val, Y_pred)
    n = len(Y_val)
    p = X_val.shape[1]
    # Adjusted R-squared is undefined unless there are more validation rows
    # than predictors + 1; report NaN instead of dividing by zero (or by a
    # negative number, which would give a meaningless value).
    residual_dof = n - p - 1
    if residual_dof > 0:
        adj_r_squared = 1 - (1 - r_squared) * (n - 1) / residual_dof
    else:
        adj_r_squared = float("nan")

    print("R-squared:", r_squared)
    print("Adjusted R-squared:", adj_r_squared)
    print("Root Mean Squared Error (RMSE):", rmse)