<a href="https://colab.research.google.com/github/PreyPython123/Master-V24-Semiveiledet-Regresjon/blob/Collagen-Pradeep/Bioco_Collagen_Semiveiledet_Regresjonsmetoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Lasting av nødvendige bibliotek og pakker

In [1]:
!pip install LAMDA-SSL
!pip install optuna



Importering av nødvendige bibliotek og pakker

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from LAMDA_SSL.Algorithm.Regression.CoReg import CoReg

from sklearn.metrics import mean_squared_error, r2_score

import optuna
from optuna.visualization import plot_optimization_history

Importering av relevant data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

# Velger første kolonne med dato og tid som index
collagen_data = pd.read_csv('/content/drive/MyDrive/MasterV24/Bioco_data/collagen_data.csv',
                            header=0,
                            sep=',',
                            index_col=0)

# Formatterer index til riktig format og datatype
collagen_data.index = pd.to_datetime(collagen_data.index,
                                     format='%Y-%m-%d %H:%M:%S')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Oppdeling av datasett til trening og testsett

In [4]:
#Tilfeldighetsfrø
random_seed = 123

# Deler opp markert del av datasettet
collagen_markert = collagen_data.dropna(subset='Collagen')
collagen_umarkert = collagen_data[collagen_data['Collagen'].isna()]

# Legger til kategorisk variabel for enzymtype til fordeling av datsettet
collagen_markert['EnzymType'] = collagen_markert.filter(like='EnzymeType_').idxmax(axis=1).str.split('_').str[1].astype('category')

# Splitter datasett i collagendataasett og kategorisk enzymtype
collagen_enzymetypes = collagen_markert['EnzymType']
collagen_markert.drop(columns=['EnzymType'], inplace=True)

# Splitter trening og testdata etter enzymtype
collagen_trening_markert, collagen_test_markert, _, _ = train_test_split(collagen_markert,
                                                                         collagen_enzymetypes,
                                                                         test_size = 0.20,
                                                                         stratify = collagen_enzymetypes,
                                                                         random_state = random_seed)

# Deler opp datasettene til prediktorer og respons, for trening og testsett
X_trening_markert = collagen_trening_markert.iloc[:, :-1]
X_trening_umarkert = collagen_umarkert.iloc[:, :-1]
X_test = collagen_test_markert.iloc[:, :-1]
y_trening_markert = collagen_trening_markert.iloc[:, -1]
y_test = collagen_test_markert.iloc[:, -1]

# Lager et fullstendig treningssett, både markert og umarkert del av datasettet
collagen_prediktor_data = collagen_data.iloc[:, :-1]
felles_indeks = collagen_prediktor_data.index.intersection(X_test.index)
X_trening = collagen_prediktor_data.drop(felles_indeks)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  collagen_markert['EnzymType'] = collagen_markert.filter(like='EnzymeType_').idxmax(axis=1).str.split('_').str[1].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  collagen_markert.drop(columns=['EnzymType'], inplace=True)


# CoReg

Skalerer data etter fullstendig treningsdata

In [5]:
skalerer = StandardScaler()
skalerer.fit(X_trening)
X_trening_markert_transformert = skalerer.transform(X_trening_markert)
X_trening_umarkert_transformert = skalerer.transform(X_trening_umarkert)
X_test_transformert = skalerer.transform(X_test)

Hyperparamter optimalisering med Optuna

In [8]:
def objective(trial):
  parametere = {
      'k1': trial.suggest_int('k1', 1, 10),
      'k2': trial.suggest_int('k2', 1, 10),
      'p1': trial.suggest_int('p1', 1, 10),
      'p2': trial.suggest_int('p2', 1, 10)
  }

  coreg = CoReg()

  coreg.fit(X_trening_markert_transformert, y_trening_markert.values, X_trening_umarkert_transformert)
  y_test_prediksjon = coreg.predict(X_test_transformert)
  mse_test = mean_squared_error(y_test, y_test_prediksjon)
  return mse_test

if __name__ == "__main__":
  study = optuna.create_study(direction='minimize')
  study.optimize(objective, n_trials=10)

plot_optimization_history(study)

[I 2024-02-08 11:33:20,911] A new study created in memory with name: no-name-4990a783-3ca9-4b97-a2c8-1c138fd1993a
[I 2024-02-08 11:35:32,833] Trial 0 finished with value: 15.079050680251624 and parameters: {'k1': 3, 'k2': 4, 'p1': 5, 'p2': 2}. Best is trial 0 with value: 15.079050680251624.
[I 2024-02-08 11:37:44,400] Trial 1 finished with value: 14.988795133399359 and parameters: {'k1': 6, 'k2': 2, 'p1': 2, 'p2': 1}. Best is trial 1 with value: 14.988795133399359.
[I 2024-02-08 11:39:56,028] Trial 2 finished with value: 13.4018063809061 and parameters: {'k1': 7, 'k2': 10, 'p1': 6, 'p2': 4}. Best is trial 2 with value: 13.4018063809061.
[I 2024-02-08 11:42:07,705] Trial 3 finished with value: 15.748723498239217 and parameters: {'k1': 4, 'k2': 6, 'p1': 10, 'p2': 3}. Best is trial 2 with value: 13.4018063809061.
[I 2024-02-08 11:44:19,445] Trial 4 finished with value: 13.582745490131718 and parameters: {'k1': 3, 'k2': 1, 'p1': 8, 'p2': 4}. Best is trial 2 with value: 13.4018063809061.
[I

Evauleringer av beste modell

In [10]:
def detailed_objective(trial):
  parametere = {
      'k1': trial.suggest_int('k1', 1, 10),
      'k2': trial.suggest_int('k2', 1, 10),
      'p1': trial.suggest_int('p1', 1, 10),
      'p2': trial.suggest_int('p2', 1, 10)
  }

  coreg = CoReg()

  coreg.fit(X_trening_markert_transformert, y_trening_markert.values, X_trening_umarkert_transformert)
  y_test_prediksjon = coreg.predict(X_test_transformert)
  y_trening_prediksjon = coreg.predict(X_trening_markert_transformert)

  mse_test = mean_squared_error(y_test, y_test_prediksjon)
  r2_test = r2_score(y_test, y_test_prediksjon)

  mse_trening = mean_squared_error(y_trening_markert, y_trening_prediksjon)
  r2_trening = r2_score(y_trening_markert, y_trening_prediksjon)

  return mse_test, r2_test, mse_trening, r2_trening

mse_test_resultat = detailed_objective(study.best_trial)[0]
r2_test_resultat = detailed_objective(study.best_trial)[1]
mse_trening_resultat = detailed_objective(study.best_trial)[2]
r2_trening_resultat = detailed_objective(study.best_trial)[3]

print("Beste hyperparametere for CoReg: {}".format(study.best_params))
print("Test: MSE: {}, og R^2: {}".format(mse_test_resultat, r2_test_resultat))
print("Trening: MSE: {}, og R^2: {}".format(mse_trening_resultat, r2_trening_resultat))

Beste hyperparametere for CoReg: {'k1': 6, 'k2': 1, 'p1': 9, 'p2': 7}
Test: MSE: 14.99615471061954, og R^2: 0.5752613091557486
Trening: MSE: 1.6514561031255204, og R^2: 0.9305162638283433
