<a href="https://colab.research.google.com/github/PreyPython123/Master-V24-Semiveiledet-Regresjon/blob/Collagen-Pradeep/Bioco_Collagen_Klassiske_Superveiledet_Regresjonsmetoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Lasting av nødvendige bibliotek og pakker

In [1]:
!pip install optuna



Importering av nødvendig bibliotek og pakker

In [2]:
import pandas as pd
import numpy as np
import copy
import math

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score

import optuna
from optuna.visualization import plot_optimization_history

Importering av relevant data

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
%cd /content/drive/MyDrive/MasterV24/GitHub/
%ls
#from oppdeling_trening_testsett import oppdeling_enzymtype

/content/drive/MyDrive/MasterV24/GitHub
'Bioco_BrixAdjusted: Semiveiledet Regresjonsmetoder.ipynb'
'Bioco_BrixAdjusted: Superveiledet Regresjonsmetoder.ipynb'
'Bioco_Collagen: Klassiske Superveiledet Regresjonsmetoder.ipynb'
'Bioco_Collagen: Semiveiledet Regresjonsmetoder.ipynb'
'Bioco: Databehandling av ekstremverdier.ipynb'
'Bioco: Databehandling av manglende verdier.ipynb'
'Bioco: Dataprosessering, Dataforståelse og Dataundersøkelse.ipynb'
'Bioco_Mw: Semiveiledet Regresjonsmetoder.ipynb'
'Bioco_Mw: Superveiledet Regresjonsmetoder.ipynb'
'Bioco_Smallmolecules: Semiveiledet Regresjonsmetoder.ipynb'
'Bioco_Smallmolecules: Superveiledet Regresjonsmetoder.ipynb'
'Bioco: Variabelundersøkelse.ipynb'
 Oppdeling_trening_testsett.ipynb
 [0m[01;34m__pycache__[0m/


In [5]:
# Velger første kolonne med dato og tid som index
collagen_data = pd.read_csv('/content/drive/MyDrive/MasterV24/Bioco_data/collagen_data.csv',
                            header=0,
                            sep=',',
                            index_col=0)

# Formatterer index til riktig format og datatype
collagen_data.index = pd.to_datetime(collagen_data.index,
                                     format='%Y-%m-%d %H:%M:%S')

Beskrivelse av datasett

In [6]:
collagen_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 29136 entries, 2022-10-31 17:37:00 to 2023-06-14 01:06:00
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   EnzymeType_A1  29136 non-null  int64  
 1   EnzymeType_A2  29136 non-null  int64  
 2   EnzymeType_B   29136 non-null  int64  
 3   EnzymeType_C   29136 non-null  int64  
 4   EnzymeType_D   29136 non-null  int64  
 5   EnzymeType_E   29136 non-null  int64  
 6   RawMatFlow     29136 non-null  float64
 7   NIRfat         29136 non-null  float64
 8   NIRash         29136 non-null  float64
 9   NIRwater       29136 non-null  float64
 10  TT07           29136 non-null  float64
 11  TT08           29136 non-null  float64
 12  PT03           29136 non-null  float64
 13  TT20           29136 non-null  float64
 14  TT09           29136 non-null  float64
 15  TT12           29136 non-null  float64
 16  Collagen       89 non-null     float64
dtypes: float64(11),

Tilfeldighetsfrø

In [7]:
#Tilfeldighetsfrø
random_seed = 123

# Oppdeling av datasett

## Alternativ 1: Fordelt på enzymtyper

In [8]:
 def trening_testsett_oppdeling_enzym(df, test_size=0.2, random_seed=123):

  # Responsvariabel
  responsvariabel = df.columns[-1]

  # Deler opp markert og umarkert del av datasettet
  data_markert = df.dropna(subset=[responsvariabel])
  data_umarkert = df[df[responsvariabel].isna()]

  # Legger til kategorisk variabel for enzymtype til fordeling av datsettet
  data_markert['EnzymType'] = data_markert.filter(like='EnzymeType_').idxmax(axis=1).str.split('_').str[1].astype('category')

  # Splitter datasett i dataasett og kategorisk enzymtype
  enzymtyper = data_markert['EnzymType']
  data_markert.drop(columns=['EnzymType'], inplace=True)

  # Splitter trening og testdata etter enzymtype
  trening_markert, test_markert, _, _ = train_test_split(data_markert,
                                                         enzymtyper,
                                                         test_size=test_size,
                                                         stratify=enzymtyper,
                                                         random_state=random_seed)

  # Definerer trening og testsett for data

  felles_indeks = df.index.intersection(test_markert.index)
  treningsdata = df.drop(felles_indeks)
  trening_umarkert = treningsdata[treningsdata[responsvariabel].isna()]

  return trening_markert, test_markert, trening_umarkert, treningsdata

## Alternativ 2: Fordelt på dag og kontinuitet

In [9]:
def trening_testsett_oppdeling_dag(df, andel=0.2):

    # Responsvariabel
    responsvariabel = df.columns[-1]

    # Lager nytt trening og testsett
    data_markert = df.dropna(subset=[responsvariabel])

    trening_markert = copy.deepcopy(data_markert)
    test_markert = pd.DataFrame(columns=list(data_markert.columns))

    # Går gjennom hver dag, hver uke og samler første andel av rader til testsett
    for uke in data_markert.index.isocalendar().week.unique():
        for dag in data_markert.index.isocalendar().day.unique():
            data = data_markert[(data_markert.index.isocalendar().week == uke) &
                      (data_markert.index.isocalendar().day == dag)]
            test_rader = data.shape[0] * andel
            test_data_dag = data.head(round(test_rader))

            test_markert = pd.concat([test_markert, test_data_dag])
            trening_markert.drop(test_data_dag.index, inplace=True)

    felles_indeks = df.index.intersection(test_markert.index)
    treningsdata = df.drop(felles_indeks)
    trening_umarkert = treningsdata[treningsdata[responsvariabel].isna()]

    return trening_markert, test_markert, trening_umarkert, treningsdata

# NIR målinger inkludert

## Oppdeling av datasett for trening og testing

### Alternativ 1: Fordelt på Enzymtype

In [10]:
collagen_trening, collagen_test, _, _ = trening_testsett_oppdeling_enzym(collagen_data)

X_trening = collagen_trening.iloc[:, :-1]
X_test = collagen_test.iloc[:, :-1]
y_trening = collagen_trening.iloc[:, -1]
y_test = collagen_test.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_markert['EnzymType'] = data_markert.filter(like='EnzymeType_').idxmax(axis=1).str.split('_').str[1].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_markert.drop(columns=['EnzymType'], inplace=True)


**RandomForestRegressor**

Hyperparamter optimalisering med Optuna

In [11]:
def objective(trial):
  parametere = {
      'n_estimators': trial.suggest_int('n_estimators', 100, 200),
      'max_depth': trial.suggest_int('max_depth', 1, 10)
  }
  rf_pipeline = Pipeline([
      ('skalerer', StandardScaler()),
      ('modell', RandomForestRegressor(**parametere, random_state=random_seed))
  ])

  rf_pipeline.fit(X_trening, y_trening)
  y_test_prediksjon = rf_pipeline.predict(X_test)
  mse_test = mean_squared_error(y_test, y_test_prediksjon)
  return mse_test

if __name__ == "__main__":
  study = optuna.create_study(direction='minimize')
  study.optimize(objective, n_trials=100)

plot_optimization_history(study)

[I 2024-02-11 14:00:31,482] A new study created in memory with name: no-name-a9dd621a-a199-4617-bd1e-6b8548b8f3a6
[I 2024-02-11 14:00:31,706] Trial 0 finished with value: 24.26655195116126 and parameters: {'n_estimators': 171, 'max_depth': 1}. Best is trial 0 with value: 24.26655195116126.
[I 2024-02-11 14:00:32,065] Trial 1 finished with value: 16.750394386485723 and parameters: {'n_estimators': 153, 'max_depth': 5}. Best is trial 1 with value: 16.750394386485723.
[I 2024-02-11 14:00:32,292] Trial 2 finished with value: 16.562077536865033 and parameters: {'n_estimators': 126, 'max_depth': 6}. Best is trial 2 with value: 16.562077536865033.
[I 2024-02-11 14:00:32,564] Trial 3 finished with value: 16.663874538298785 and parameters: {'n_estimators': 161, 'max_depth': 7}. Best is trial 2 with value: 16.562077536865033.
[I 2024-02-11 14:00:32,836] Trial 4 finished with value: 16.88281740713207 and parameters: {'n_estimators': 181, 'max_depth': 5}. Best is trial 2 with value: 16.56207753686

Evauleringer av beste modell

In [12]:
def detailed_objective(trial):
  parametere = {
      'n_estimators': trial.suggest_int('n_estimators', 100, 200),
      'max_depth': trial.suggest_int('max_depth', 1, 10)
  }
  rf_pipeline = Pipeline([
      ('skalerer', StandardScaler()),
      ('modell', RandomForestRegressor(**parametere, random_state=random_seed))
  ])

  rf_pipeline.fit(X_trening, y_trening)
  y_test_prediksjon = rf_pipeline.predict(X_test)
  y_trening_prediksjon = rf_pipeline.predict(X_trening)

  mse_test = mean_squared_error(y_test, y_test_prediksjon)
  r2_test = r2_score(y_test, y_test_prediksjon)

  mse_trening = mean_squared_error(y_trening, y_trening_prediksjon)
  r2_trening = r2_score(y_trening, y_trening_prediksjon)

  return mse_test, r2_test, mse_trening, r2_trening

mse_test_resultat = detailed_objective(study.best_trial)[0]
r2_test_resultat = detailed_objective(study.best_trial)[1]
mse_trening_resultat = detailed_objective(study.best_trial)[2]
r2_trening_resultat = detailed_objective(study.best_trial)[3]

print("Beste hyperparametere for RandomForestRegressor: {}".format(study.best_params))
print("Test: MSE: {}, og R^2: {}".format(mse_test_resultat, r2_test_resultat))
print("Trening: MSE: {}, og R^2: {}".format(mse_trening_resultat, r2_trening_resultat))

Beste hyperparametere for RandomForestRegressor: {'n_estimators': 125, 'max_depth': 8}
Test: MSE: 16.1279219616204, og R^2: 0.5359083947487182
Trening: MSE: 1.6344692049527287, og R^2: 0.9354977759025687


### Alternativ 2: Fordelt på dag og kontinuitet

In [16]:
collagen_trening, collagen_test, _, _ = trening_testsett_oppdeling_dag(collagen_data)

X_trening = collagen_trening.iloc[:, :-1]
X_test = collagen_test.iloc[:, :-1]
y_trening = collagen_trening.iloc[:, -1]
y_test = collagen_test.iloc[:, -1]

**RandomForestRegressor**

Hyperparameteroptimalisering med Optuna

In [17]:
def objective(trial):
  parametere = {
      'n_estimators': trial.suggest_int('n_estimators', 100, 200),
      'max_depth': trial.suggest_int('max_depth', 1, 10)
  }
  rf_pipeline = Pipeline([
      ('skalerer', StandardScaler()),
      ('modell', RandomForestRegressor(**parametere, random_state=random_seed))
  ])

  rf_pipeline.fit(X_trening, y_trening)
  y_test_prediksjon = rf_pipeline.predict(X_test)
  mse_test = mean_squared_error(y_test, y_test_prediksjon)
  return mse_test

if __name__ == "__main__":
  study = optuna.create_study(direction='minimize')
  study.optimize(objective, n_trials=100)

plot_optimization_history(study)

[I 2024-02-11 14:03:45,592] A new study created in memory with name: no-name-480596f4-e82f-4360-b2cd-24c612a62412
[I 2024-02-11 14:03:46,109] Trial 0 finished with value: 8.51127804230406 and parameters: {'n_estimators': 105, 'max_depth': 5}. Best is trial 0 with value: 8.51127804230406.
[I 2024-02-11 14:03:46,631] Trial 1 finished with value: 8.217833943328044 and parameters: {'n_estimators': 196, 'max_depth': 9}. Best is trial 1 with value: 8.217833943328044.
[I 2024-02-11 14:03:46,933] Trial 2 finished with value: 13.470954374386224 and parameters: {'n_estimators': 136, 'max_depth': 1}. Best is trial 1 with value: 8.217833943328044.
[I 2024-02-11 14:03:47,203] Trial 3 finished with value: 8.389732034186938 and parameters: {'n_estimators': 107, 'max_depth': 5}. Best is trial 1 with value: 8.217833943328044.
[I 2024-02-11 14:03:47,517] Trial 4 finished with value: 8.323521681386579 and parameters: {'n_estimators': 192, 'max_depth': 8}. Best is trial 1 with value: 8.217833943328044.
[I

In [18]:
def detailed_objective(trial):
  parametere = {
      'n_estimators': trial.suggest_int('n_estimators', 100, 200),
      'max_depth': trial.suggest_int('max_depth', 1, 10)
  }
  rf_pipeline = Pipeline([
      ('skalerer', StandardScaler()),
      ('modell', RandomForestRegressor(**parametere, random_state=random_seed))
  ])

  rf_pipeline.fit(X_trening, y_trening)
  y_test_prediksjon = rf_pipeline.predict(X_test)
  y_trening_prediksjon = rf_pipeline.predict(X_trening)

  mse_test = mean_squared_error(y_test, y_test_prediksjon)
  r2_test = r2_score(y_test, y_test_prediksjon)

  mse_trening = mean_squared_error(y_trening, y_trening_prediksjon)
  r2_trening = r2_score(y_trening, y_trening_prediksjon)

  return mse_test, r2_test, mse_trening, r2_trening

mse_test_resultat = detailed_objective(study.best_trial)[0]
r2_test_resultat = detailed_objective(study.best_trial)[1]
mse_trening_resultat = detailed_objective(study.best_trial)[2]
r2_trening_resultat = detailed_objective(study.best_trial)[3]

print("Beste hyperparametere for RandomForestRegressor: {}".format(study.best_params))
print("Test: MSE: {}, og R^2: {}".format(mse_test_resultat, r2_test_resultat))
print("Trening: MSE: {}, og R^2: {}".format(mse_trening_resultat, r2_trening_resultat))

Beste hyperparametere for RandomForestRegressor: {'n_estimators': 197, 'max_depth': 7}
Test: MSE: 8.079743300024674, og R^2: 0.6577768822921066
Trening: MSE: 1.817864367572221, og R^2: 0.9351151886016815


# NIR målinger ekskludert