####Installing anvil-uplink to allow connection to our Anvil project

In [None]:
# %pip installs into the running kernel's environment; the version is pinned to
# the one this notebook was last run with so re-runs are reproducible.
%pip install -q anvil-uplink==0.5.2

Collecting anvil-uplink
  Downloading anvil_uplink-0.5.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting argparse (from anvil-uplink)
  Downloading argparse-1.4.0-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting ws4py-sslupdate (from anvil-uplink)
  Downloading ws4py_sslupdate-0.5.1b0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading anvil_uplink-0.5.2-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.9/97.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Downloading ws4py_sslupdate-0.5.1b0-py2.py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ws4py-sslupdate, argparse, anvil-uplink
Successfully installed anvil-uplink-0.5.2 argparse-1.4.0 ws4py-sslupdate-0.5.1b0


###Importing the necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import os
from google.colab import userdata
import random
import io
import base64
import matplotlib.pyplot as plt

In [None]:
import anvil.server
# The uplink key is a credential, so it is read from Colab's secret store
# (google.colab.userdata, imported above) instead of being written in the notebook.
# Store the key under the secret name ANVIL_UPLINK_KEY before running this cell.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be rotated in the Anvil editor.
anvil.server.connect(userdata.get("ANVIL_UPLINK_KEY"))

# Shared state used by the callables below.
df = None            # DataFrame loaded by process_csv
target = None        # Series picked by process_selected_column
problem_type = None  # set by process_selected_column

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default Environment" as SERVER


###Reading the csv file, storing it in a temp file and returning its details to anvil

In [None]:
@anvil.server.callable
def process_csv(media_file):
    """Load an uploaded CSV into the global ``df`` and report its basic details.

    Args:
        media_file: Uploaded file object; only its ``name`` attribute and
            ``get_bytes()`` method are used here.

    Returns:
        dict with keys ``file_name``, ``num_rows``, ``num_columns`` and
        ``column_names``.
    """
    global df

    # The name is supplied by the client, so keep only its final path component;
    # otherwise a name such as "../../x" would write outside /tmp.
    file_name = os.path.basename(media_file.name or "") or "uploaded.csv"
    file_path = os.path.join("/tmp", file_name)

    # Save the uploaded bytes to a temporary file in Colab
    with open(file_path, "wb") as f:
        f.write(media_file.get_bytes())

    # Read the CSV file using pandas
    df = pd.read_csv(file_path)

    num_rows, num_columns = df.shape
    column_names = list(df.columns)

    # Print file details in the Colab output for debugging
    print(f"File '{file_name}' details:")
    print(f"Number of rows: {num_rows}")
    print(f"Number of columns: {num_columns}")
    print(f"Column names: {column_names}")

    # Return the file details to the Anvil client
    return {
        "file_name": file_name,
        "num_rows": num_rows,
        "num_columns": num_columns,
        "column_names": column_names
    }

###Identifying the problem (classification/regression)

In [None]:
@anvil.server.callable
def process_selected_column(selected_column):
    """Store the chosen target column and classify the problem type from its dtype.

    Sets the globals ``target`` (the selected Series) and ``problem_type``.

    Args:
        selected_column: Name of the column in the global ``df`` to use as target.

    Returns:
        ``{"problem_type": ...}`` on success, or an error string when no data is
        loaded or the column does not exist.
    """
    global df
    global target
    global problem_type

    if df is None:
        return "No data loaded"

    # Validate before indexing: df[selected_column] raises KeyError for an
    # unknown column, which would make this check unreachable.
    if selected_column not in df.columns:
        return f"Column '{selected_column}' not found in the data"

    target = df[selected_column]
    column_type = target.dtype

    # String, boolean, or numeric columns holding only 0/1 are classification
    if (
        pd.api.types.is_string_dtype(target)
        or pd.api.types.is_bool_dtype(target)
        or is_boolean_numerical(target)
    ):
        problem_type = "classification"

    # Any other numeric column (floats included) is regression
    elif pd.api.types.is_numeric_dtype(target):
        problem_type = "regression"

    else:
        problem_type = "Could not determine"

    print(column_type)
    print(f"Selected Column: {selected_column}")
    print(f"Problem Type: {problem_type}")

    # Return the results for Anvil
    return {
        'problem_type': problem_type,
    }

def is_boolean_numerical(column):
    """Return True when every non-null value in ``column`` is 0 or 1."""
    observed = set(column.dropna().unique())
    return observed <= {0, 1}


###This cell returns the number of null values in all columns

In [None]:
@anvil.server.callable
def null_count():
    """Count the null values in every column of the global ``df``.

    Returns:
        ``{"null_count": {column: count}}``, or the string "No data loaded"
        when no DataFrame has been loaded.
    """
    if df is None:
        return "No data loaded"

    nulls_per_column = df.isna().sum().to_dict()

    print(f"Number of nulls in is: {nulls_per_column}")

    return {"null_count": nulls_per_column}

###This cell returns the number of blank values in all columns

In [None]:
@anvil.server.callable
def blank_count():
    """Count the empty-string cells in every column of the global ``df``.

    Returns:
        ``{"blank_count": {column: count}}``, or ``{"error": "No data loaded"}``
        when no DataFrame has been loaded.
    """
    if df is None:
        return {"error": "No data loaded"}

    # Element-wise comparison against '' followed by a per-column sum
    blanks_per_column = df.eq('').sum().to_dict()

    print(f"Number of blanks in each column: {blanks_per_column}")

    return {"blank_count": blanks_per_column}

###This cell fills all null values

In [None]:
@anvil.server.callable
def fill_nulls():
    """Fill null values in the global ``df`` column by column.

    Numeric columns are filled with the column mean; string and boolean columns
    with the most frequent value. Columns of any other dtype are left untouched.

    Returns:
        "success", or "No data loaded" when no DataFrame has been loaded.
    """
    global df

    if df is None:
        return "No data loaded"

    for column in df.columns:
        series = df[column]

        # Nothing to fill; also avoids touching the dtype of complete columns
        if not series.isna().any():
            continue

        if pd.api.types.is_numeric_dtype(series):
            fill_value = series.mean()
        elif pd.api.types.is_string_dtype(series) or pd.api.types.is_bool_dtype(series):
            modes = series.mode()
            # mode() is empty for an all-null column; there is nothing to fill with
            if modes.empty:
                continue
            fill_value = modes.iloc[0]
        else:
            continue

        # Assign the result back instead of calling fillna(inplace=True) on
        # df[column]: that chained form is deprecated and does not modify df
        # when pandas Copy-on-Write is enabled.
        df[column] = series.fillna(fill_value)

    # Return to ensure the method executed
    return "success"

###This cell removes Duplicates

In [None]:
@anvil.server.callable
def remove_duplicates():
  global df

  if df is None:
    return "No data loaded"

  df.drop_duplicates(inplace=True)

  return "Removed Duplicates"

###This cell returns the number of outliers in the numeric columns



In [None]:
@anvil.server.callable
def detect_outliers():
    """Count outliers across every numeric column of the global ``df``.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of its own column. The reported figure
    is the sum of the per-column counts.

    Returns:
        ``{"outlier_results": {"The total number of outliers is": total}}``, or
        "No data loaded" when no DataFrame has been loaded.
    """
    global df
    if df is None:
        return "No data loaded"

    # Accumulate across columns. Starting from 0 also covers a frame with no
    # numeric columns, and avoids keeping only the last column's count.
    total_outliers = 0

    for column in df.select_dtypes(include=[np.number]).columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        is_outlier = (values < lower_bound) | (values > upper_bound)
        total_outliers += int(is_outlier.sum())

    outlier_results = {
        "The total number of outliers is": total_outliers,
    }

    return {"outlier_results": outlier_results}


###This cell counts the duplicate rows in the dataset

In [None]:
@anvil.server.callable
def detect_duplicates():
    """Count fully duplicated rows in the global ``df``.

    Returns:
        ``{"duplicates": count}``, or "No data loaded" when no DataFrame has
        been loaded.
    """
    global df

    if df is None:
        return "No data loaded"

    # duplicated() flags every repeat of an earlier row; the sum is a NumPy
    # integer, so convert it to a plain int before it is sent to the client.
    duplicates = int(df.duplicated().sum())

    return {"duplicates": duplicates}


###Checking for class imbalance if it's a classification problem.

In [None]:
@anvil.server.callable
def balanced_classes(selected_column, imbalance_threshold=0.1):
    """Report whether every class of ``selected_column`` is sufficiently frequent.

    Args:
        selected_column: Name of the column in the global ``df`` to inspect.
        imbalance_threshold: Minimum relative frequency (out of 1.0) a class
            must reach for the data to count as balanced. Defaults to 0.1.

    Returns:
        True when no class falls below the threshold, False otherwise; or an
        error string when no data is loaded or the column does not exist.
    """
    global df

    # Same guard as the other callables; without it a call made before any
    # upload fails on ``df.columns``.
    if df is None:
        return "No data loaded"

    if selected_column not in df.columns:
        return f"Column '{selected_column}' not found in the data"

    # Relative frequency of each class, out of 1.0
    class_distribution = df[selected_column].value_counts(normalize=True)

    # Balanced means no class is rarer than the threshold
    return bool((class_distribution >= imbalance_threshold).all())


In [None]:

@anvil.server.callable
def get_dataset():
    """Return the global ``df``, or "No data loaded" when nothing is loaded.

    NOTE(review): this hands back the pandas DataFrame object itself; confirm
    the Anvil client can receive that type.
    """
    return "No data loaded" if df is None else df

##Training

In [None]:

# The regression cell imports carbontracker.tracker.CarbonTracker, so the
# package has to be installed; with this line commented out that import fails
# with ModuleNotFoundError.
%pip install -q carbontracker

In [None]:
# Explicit %pip magic (does not depend on automagic) with the version pinned to
# the one this notebook was last run with.
%pip install -q codecarbon==2.8.3

Collecting codecarbon
  Downloading codecarbon-2.8.3-py3-none-any.whl.metadata (8.7 kB)
Collecting arrow (from codecarbon)
  Downloading arrow-1.3.0-py3-none-any.whl.metadata (7.5 kB)
Collecting fief-client[cli] (from codecarbon)
  Downloading fief_client-0.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting questionary (from codecarbon)
  Downloading questionary-2.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting rapidfuzz (from codecarbon)
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting types-python-dateutil>=2.8.10 (from arrow->codecarbon)
  Downloading types_python_dateutil-2.9.0.20241206-py3-none-any.whl.metadata (2.1 kB)
Collecting httpx<0.28.0,>=0.21.3 (from fief-client[cli]->codecarbon)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jwcrypto<2.0.0,>=1.4 (from fief-client[cli]->codecarbon)
  Downloading jwcrypto-1.5.6-py3-none-any.whl.metadata (3.1 kB)
Collecting yaspin (from fief-clie

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from codecarbon import EmissionsTracker
from sklearn.preprocessing import LabelEncoder
#from carbontracker.tracker import CarbonTracker

##Classification

In [None]:
import time
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils import shuffle

# Use SGD-based SVM for large datasets
def build_train_and_track_emissions_svm(X_train, y_train, X_val, y_val, output_size, input_size):
    """Train a linear SVM with SGD in mini-batches and report emissions and metrics.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels used for the metrics.
        output_size, input_size: Accepted for signature compatibility with the
            other trainers; not used here.

    Returns:
        A newline-separated report string with the estimated CO₂ emissions,
        training time, validation accuracy and weighted F-score.
    """
    # Hinge loss makes SGDClassifier a linear SVM
    model = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3, random_state=42, n_jobs=-1)

    # Mini-batch training for large datasets
    batch_size = 10000  # Adjust as needed
    X_train, y_train = shuffle(X_train, y_train, random_state=42)

    # partial_fit needs the full label set; compute it once, not once per batch
    classes = np.unique(y_train)

    # Emissions tracking is optional: fall back gracefully without codecarbon
    try:
        from codecarbon import EmissionsTracker
        tracker = EmissionsTracker()
    except ImportError:
        tracker = None

    # The tracker must wrap the training loop; starting and stopping it after
    # training measures nothing.
    if tracker is not None:
        tracker.start()
    start_time = time.time()

    # NOTE(review): each batch is passed to partial_fit once, so this looks like
    # a single pass over the data — confirm that is intended.
    for i in range(0, len(X_train), batch_size):
        X_batch = X_train[i:i + batch_size]
        y_batch = y_train[i:i + batch_size]
        model.partial_fit(X_batch, y_batch, classes=classes)

    emissions_kg = tracker.stop() if tracker is not None else None
    total_time = time.time() - start_time

    # Predictions on validation data
    y_pred = model.predict(X_val)

    # Metrics
    accuracy = accuracy_score(y_val, y_pred)
    f_score = f1_score(y_val, y_pred, average='weighted')

    # The value is multiplied by 1000 and labelled in grams, matching the other
    # trainers in this notebook (the previous label said mg for the same value).
    if emissions_kg is None:
        emissions_line = "Estimated CO₂ emissions: Tracking not enabled\n"
    else:
        emissions_line = f"Estimated CO₂ emissions: {emissions_kg * 1000:.6f} g\n"

    # Results
    result_output = (
        emissions_line
        + f"Training Time: {total_time:.2f} seconds\n"
        + f"Final Accuracy: {accuracy:.4f}\n"
        + f"F-score: {f_score:.4f}"
    )

    return result_output


In [None]:


import time
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score
from codecarbon import EmissionsTracker


# DECISION TREE

def build_train_and_track_emissions_dt(X_train, y_train, X_val, y_val, input_size, output_size):
    """Fit a decision tree and report emissions, fit time, accuracy and F-score.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels used for the metrics.
        input_size, output_size: Accepted but not used by this trainer.

    Returns:
        A newline-separated report string.
    """
    classifier = DecisionTreeClassifier(
        criterion="gini",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
    )

    # Emissions tracking and the wall clock both bracket the fit call only
    emissions_tracker = EmissionsTracker()
    emissions_tracker.start()
    fit_started = time.time()

    classifier.fit(X_train, y_train)

    # The tracker's figure is scaled by 1000 and reported with a "g" label
    emissions = emissions_tracker.stop() * 1000
    total_time = time.time() - fit_started

    # Metrics are computed on the validation split
    val_predictions = classifier.predict(X_val)
    final_accuracy = accuracy_score(y_val, val_predictions)
    f_score = f1_score(y_val, val_predictions, average="weighted")

    report_lines = [
        f"Estimated CO₂ emissions: {emissions:.6f} g",
        f"Time Taken: {total_time:.2f} seconds",
        f"Final Accuracy: {final_accuracy:.4f}",
        f"F-score: {f_score:.4f}",
    ]
    return "\n".join(report_lines)


In [None]:
# No install needed: Keras ships inside TensorFlow and is imported below via
# `from tensorflow import keras`. The PyPI project named "tensorflow.keras"
# resolves to an unrelated 0.1 package, so it is deliberately not installed.

Collecting tensorflow.keras
  Downloading tensorflow_keras-0.1-py3-none-any.whl.metadata (63 bytes)
Downloading tensorflow_keras-0.1-py3-none-any.whl (5.2 kB)
Installing collected packages: tensorflow.keras
Successfully installed tensorflow.keras-0.1


In [None]:
import io
import sys
import time
import numpy as np
from sklearn.metrics import f1_score
from tensorflow import keras
from tensorflow.keras import layers
from codecarbon import EmissionsTracker

def build_train_and_track_emissions_nnn(X_train, y_train, X_val, y_val, input_size, output_size):
    """Train a small binary classifier network and report emissions and metrics.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation data, used for early stopping and the F-score.
        input_size: Number of input features.
        output_size: Accepted but not used; the network has one sigmoid output.

    Returns:
        A newline-separated report string.
    """
    network_layers = [
        layers.Dense(4, activation='relu', input_shape=(input_size,)),
        layers.Dense(4, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ]
    model = keras.Sequential(network_layers)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

    stop_when_flat = keras.callbacks.EarlyStopping(
        patience=10,
        min_delta=0.001,
        restore_best_weights=True,
    )

    # Emissions tracking and the wall clock both bracket the fit call only
    emissions_tracker = EmissionsTracker()
    emissions_tracker.start()
    fit_started = time.time()

    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        batch_size=512,
        epochs=1000,
        callbacks=[stop_when_flat],
        verbose=0,  # keep training silent
    )

    # The tracker's figure is scaled by 1000 and reported with a "g" label
    emissions = emissions_tracker.stop() * 1000
    total_time = time.time() - fit_started

    training_log = history.history
    num_epochs = len(training_log['loss'])  # epochs actually run
    final_accuracy = training_log['binary_accuracy'][-1]  # last-epoch training value

    # Threshold the sigmoid output at 0.5 to obtain class labels for the F-score
    val_probabilities = model.predict(X_val)
    val_labels = (val_probabilities > 0.5).astype(int)
    f_score = f1_score(y_val, val_labels, average='weighted')

    report_lines = [
        f"Estimated CO₂ emissions: {emissions:.6f} g",
        f"Number of Epochs: {num_epochs}",
        f"Time Taken: {total_time:.2f} seconds",
        f"Final Accuracy: {final_accuracy:.4f}",
        f"F-score: {f_score:.4f}",
    ]
    return "\n".join(report_lines)


In [None]:
import io
import sys
import time
from sklearn.metrics import f1_score
# modified one (Acccuracy percentage)
def build_train_and_track_emissions_nn(X_train, y_train, X_val, y_val, input_size, output_size):
    """Train a multi-class neural network and report emissions, metrics and a plot.

    Args:
        X_train, y_train: Training features and integer class labels.
        X_val, y_val: Validation data, used for early stopping and the metrics.
        input_size: Number of input features.
        output_size: Number of classes (width of the softmax layer).

    Returns:
        Tuple ``(result_output, graph_media)``: a newline-separated report
        string and a PNG of the accuracy/loss curves wrapped in
        ``anvil.BlobMedia``.
    """
    hidden_layer_size = 50  # Number of neurons in each hidden layer

    # Three Dense -> BatchNorm -> Dropout stages followed by a softmax output
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(hidden_layer_size, activation='relu', input_shape=(input_size,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),

        tf.keras.layers.Dense(output_size, activation='softmax')
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # Early stopping to prevent overfitting
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=25, restore_best_weights=True)

    # Dynamic batch size: smaller for small datasets, capped at 64
    batch_size = min(64, max(16, len(X_train) // 50))

    # Emissions tracking and the wall clock both bracket the fit call only
    tracker = EmissionsTracker()
    tracker.start()
    start_time = time.time()

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=1000,
        batch_size=batch_size,
        callbacks=[early_stopping],
        verbose=2
    )

    # The tracker's figure is scaled by 1000 and reported with a "g" label
    emissions = tracker.stop() * 1000
    total_time = time.time() - start_time  # Time taken in seconds

    # Number of epochs used before early stopping
    num_epochs = len(history.history['loss'])

    # Validation-set predictions of the final model
    y_pred_prob = model.predict(X_val)
    y_pred = np.argmax(y_pred_prob, axis=1)  # Convert probabilities to class labels

    # Report validation accuracy, as the scikit-learn trainers in this notebook
    # do. The last-epoch training accuracy is measured with dropout active and
    # need not correspond to the weights restored by early stopping.
    final_accuracy = float(np.mean(y_pred == np.asarray(y_val).ravel()))
    f_score = f1_score(y_val, y_pred, average='weighted')

    # Training history graph
    fig, ax = plt.subplots(1, 2, figsize=(12, 5))
    try:
        # Accuracy plot
        ax[0].plot(history.history['accuracy'], label="Train Accuracy")
        ax[0].plot(history.history['val_accuracy'], label="Validation Accuracy")
        ax[0].set_title("Model Accuracy")
        ax[0].set_xlabel("Epoch")
        ax[0].set_ylabel("Accuracy")
        ax[0].legend()

        # Loss plot
        ax[1].plot(history.history['loss'], label="Train Loss")
        ax[1].plot(history.history['val_loss'], label="Validation Loss")
        ax[1].set_title("Model Loss")
        ax[1].set_xlabel("Epoch")
        ax[1].set_ylabel("Loss")
        ax[1].legend()

        fig.tight_layout()

        # Save the plot to an in-memory buffer
        buffer = io.BytesIO()
        fig.savefig(buffer, format="png")
    finally:
        # Close the figure so repeated calls do not accumulate open figures
        plt.close(fig)

    # Convert buffer to Anvil Media
    graph_media = anvil.BlobMedia("image/png", buffer.getvalue())

    # Create a result string
    result_output = (
        f"Estimated CO₂ emissions: {emissions:.6f} g\n"
        f"Number of Epochs: {num_epochs}\n"
        f"Time Taken: {total_time:.2f} seconds\n"
        f"Final Accuracy: {final_accuracy:.4f}\n"
        f"F-score: {f_score:.4f}"
    )

    return result_output, graph_media

In [None]:
def split_data(df, target_column):
    """Split ``df`` into 70% train, 15% validation and 15% test arrays.

    Args:
        df: DataFrame holding the features and the target column.
        target_column: Name of the column to use as the target.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    features = df.drop(columns=[target_column]).values
    labels = df[target_column].values

    # First hold out 30%, then halve that hold-out into validation and test
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
def encode_categorical_columns(df):
    """Label-encode every object or categorical column of ``df`` in place.

    Args:
        df: DataFrame to encode. It is modified directly; pass a copy to keep
            the original values.

    Returns:
        The same DataFrame object, with the encoded columns holding integer codes.
    """
    for column in df.columns:
        dtype = df[column].dtype
        # isinstance on the dtype replaces the deprecated
        # pd.api.types.is_categorical_dtype check
        if dtype == 'object' or isinstance(dtype, pd.CategoricalDtype):
            print(f"Encoding column: {column}")
            label_encoder = LabelEncoder()
            df[column] = label_encoder.fit_transform(df[column])
    return df


GRADIENT BOOSTING

In [None]:

import time
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score
from codecarbon import EmissionsTracker

# GRADIENT BOOSTING CLASSIFIER

def build_train_and_track_emissions_gbc(X_train, y_train, X_val, y_val):
    """Fit a gradient-boosting classifier and report emissions and metrics.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels used for the metrics.

    Returns:
        A newline-separated report string.
    """
    booster = GradientBoostingClassifier(
        loss='log_loss',
        learning_rate=0.1,
        n_estimators=100,
        subsample=1.0,
        criterion='friedman_mse',
        min_samples_split=2,
        min_samples_leaf=1,
        max_depth=3,
        random_state=42,
    )

    # Emissions tracking and the wall clock both bracket the fit call only
    emissions_tracker = EmissionsTracker()
    emissions_tracker.start()
    fit_started = time.time()

    booster.fit(X_train, y_train)

    # The tracker's figure is scaled by 1000 and reported with a "g" label
    emissions = emissions_tracker.stop() * 1000
    total_time = time.time() - fit_started

    # Metrics are computed on the validation split
    val_predictions = booster.predict(X_val)
    final_accuracy = accuracy_score(y_val, val_predictions)
    f_score = f1_score(y_val, val_predictions, average="weighted")

    report_lines = [
        f"Estimated CO₂ emissions: {emissions:.6f} g",
        f"Time Taken: {total_time:.2f} seconds",
        f"Final Accuracy: {final_accuracy:.4f}",
        f"F-score: {f_score:.4f}",
    ]
    return "\n".join(report_lines)

*Random Forest*

In [None]:
import time
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from codecarbon import EmissionsTracker

def build_train_and_track_emissions_rf(X_train, y_train, X_val, y_val):
    """Fit a 100-tree random forest and report emissions and metrics.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels used for the metrics.

    Returns:
        A newline-separated report string.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)

    # Emissions tracking and the wall clock both bracket the fit call only
    emissions_tracker = EmissionsTracker()
    emissions_tracker.start()
    fit_started = time.time()

    forest.fit(X_train, y_train)

    # The tracker's figure is scaled by 1000 and reported with a "g" label
    emissions = emissions_tracker.stop() * 1000
    total_time = time.time() - fit_started

    # Metrics are computed on the validation split
    val_predictions = forest.predict(X_val)
    accuracy = accuracy_score(y_val, val_predictions)
    f_score = f1_score(y_val, val_predictions, average='weighted')

    return "\n".join([
        f"Estimated CO₂ emissions: {emissions:.6f} g",
        f"Time Taken: {total_time:.2f} seconds",
        f"Final Accuracy: {accuracy:.4f}",
        f"F-score: {f_score:.4f}",
    ])

In [None]:
def _prepare_classification_data(selected_column):
    """Validate the request and build the arrays shared by every classifier.

    The global ``df`` is copied before label-encoding so the loaded dataset keeps
    its original values and dtypes for later calls (problem-type detection,
    null/blank counts, and so on).

    Args:
        selected_column: Name of the target column in the global ``df``.

    Returns:
        ``(X_train, y_train, X_val, y_val, input_size, output_size)`` where
        ``input_size`` is the number of feature columns and ``output_size`` the
        number of distinct target values.

    Raises:
        ValueError: If no data is loaded or the column does not exist.
    """
    if df is None:
        raise ValueError("No data loaded")
    if selected_column not in df.columns:
        raise ValueError(f"Column '{selected_column}' not found in the data")

    encoded_df = encode_categorical_columns(df.copy())

    # The test split is produced by split_data but not used by the trainers
    X_train, X_val, _X_test, y_train, y_val, _y_test = split_data(encoded_df, selected_column)

    input_size = X_train.shape[1]
    output_size = encoded_df[selected_column].nunique()
    return X_train, y_train, X_val, y_val, input_size, output_size


@anvil.server.callable
def run_nn_for_classification_with_emissions(selected_column):
    """Train the neural-network classifier on ``selected_column``.

    Returns:
        Tuple ``(result_output, graph_media)`` from
        ``build_train_and_track_emissions_nn``.
    """
    X_train, y_train, X_val, y_val, input_size, output_size = _prepare_classification_data(selected_column)
    result_output, graph_media = build_train_and_track_emissions_nn(
        X_train, y_train, X_val, y_val, input_size, output_size
    )
    return result_output, graph_media


@anvil.server.callable
def run_dt_for_classification_with_emissions(selected_column):
    """Train the decision-tree classifier on ``selected_column``; returns its report string."""
    X_train, y_train, X_val, y_val, input_size, output_size = _prepare_classification_data(selected_column)
    return build_train_and_track_emissions_dt(X_train, y_train, X_val, y_val, input_size, output_size)


@anvil.server.callable
def run_svm_for_classification_with_emissions(selected_column):
    """Train the SGD-based SVM classifier on ``selected_column``; returns its report string."""
    X_train, y_train, X_val, y_val, input_size, output_size = _prepare_classification_data(selected_column)
    # Keywords are used because the SVM trainer declares output_size before
    # input_size, unlike the other trainers.
    return build_train_and_track_emissions_svm(
        X_train, y_train, X_val, y_val, input_size=input_size, output_size=output_size
    )


@anvil.server.callable
def run_gbc_for_classification_with_emissions(selected_column):
    """Train the gradient-boosting classifier on ``selected_column``; returns its report string."""
    X_train, y_train, X_val, y_val, _input_size, _output_size = _prepare_classification_data(selected_column)
    return build_train_and_track_emissions_gbc(X_train, y_train, X_val, y_val)


@anvil.server.callable
def run_rf_for_classification_with_emissions(selected_column):
    """Train the random-forest classifier on ``selected_column``; returns its report string."""
    X_train, y_train, X_val, y_val, _input_size, _output_size = _prepare_classification_data(selected_column)
    return build_train_and_track_emissions_rf(X_train, y_train, X_val, y_val)

##Regression

In [None]:

import io
import sys
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from carbontracker.tracker import CarbonTracker
# Modified one (MSE)


def build_train_and_track_emissions1(X_train, y_train, X_val, y_val, input_size):
    """Train a regression network while CarbonTracker monitors the run.

    Everything printed during training is captured, and the consumption section
    of that output is returned together with the final training MSE.

    Args:
        X_train, y_train: Training features and targets.
        X_val, y_val: Validation data, used for early stopping.
        input_size: Number of input features.

    Returns:
        A string holding the captured consumption lines followed by
        "Final Mean Squared Error (MSE): <value>".
    """
    # Local import keeps this fix self-contained
    import contextlib

    hidden_layer_size = 50  # Number of neurons in each hidden layer

    # Two hidden layers and a single linear output for regression
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(hidden_layer_size, activation='relu', input_shape=(input_size,)),
        tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
        tf.keras.layers.Dense(1)
    ])

    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

    # Early stopping to prevent overfitting
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

    # Capture stdout only for the duration of training. The context manager
    # restores whatever stream was active before, even if fit() raises. Assigning
    # sys.__stdout__ instead would detach later print() calls from the notebook.
    buffer = io.StringIO()
    with contextlib.redirect_stdout(buffer):
        tracker = CarbonTracker(epochs=100)

        # The whole fit call is bracketed as one tracked epoch
        tracker.epoch_start()
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=100,
            batch_size=32,
            callbacks=[early_stopping],
            verbose=2
        )
        tracker.epoch_end()

    output = buffer.getvalue()

    # Keep the lines from the first consumption header up to and including the
    # car-equivalence line
    relevant_lines = []
    capturing = False
    for line in output.split("\n"):
        if "Actual consumption" in line or "Predicted consumption" in line:
            capturing = True
        if capturing:
            relevant_lines.append(line)
            if "travelled by car" in line:
                break

    # Last MSE value recorded in the training history
    final_mse = history.history['mse'][-1]

    result_output = "\n".join(relevant_lines) + f"\nFinal Mean Squared Error (MSE): {final_mse}"

    return result_output

def split_data1(df, target_column):
    """Split ``df`` into 70% train, 15% validation and 15% test arrays.

    Args:
        df: DataFrame holding the features and the target column.
        target_column: Name of the column to use as the target.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    features = df.drop(columns=[target_column]).values
    targets = df[target_column].values

    # First hold out 30%, then halve that hold-out into validation and test
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, targets, test_size=0.3, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

def encode_categorical_columns1(df):
    """Label-encode every object or categorical column of ``df`` in place.

    Args:
        df: DataFrame to encode. It is modified directly; pass a copy to keep
            the original values.

    Returns:
        The same DataFrame object, with the encoded columns holding integer codes.
    """
    for column in df.columns:
        dtype = df[column].dtype
        # isinstance on the dtype replaces the deprecated
        # pd.api.types.is_categorical_dtype check
        if dtype == 'object' or isinstance(dtype, pd.CategoricalDtype):
            print(f"Encoding column: {column}")
            label_encoder = LabelEncoder()
            df[column] = label_encoder.fit_transform(df[column])
    return df

@anvil.server.callable
def run_nn_for_regression_with_emissions1(selected_column):
    """Train the regression network on ``selected_column`` and return its report.

    The global ``df`` is copied before label-encoding so the loaded dataset keeps
    its original values and dtypes for later calls.

    Args:
        selected_column: Name of the target column in the global ``df``.

    Returns:
        The report string produced by ``build_train_and_track_emissions1``.

    Raises:
        ValueError: If no data is loaded or the column does not exist.
    """
    global df

    if df is None:
        raise ValueError("No data loaded")
    if selected_column not in df.columns:
        raise ValueError(f"Column '{selected_column}' not found in the data")

    # Encode a copy, then split; the test split is not used by the trainer
    encoded_df = encode_categorical_columns1(df.copy())
    X_train, X_val, _X_test, y_train, y_val, _y_test = split_data1(encoded_df, selected_column)

    # Train neural network and track emissions
    input_size = X_train.shape[1]
    return build_train_and_track_emissions1(X_train, y_train, X_val, y_val, input_size)


ModuleNotFoundError: No module named 'carbontracker'