## Importuotos bibliotekos

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict, deque
import random
from dataclasses import dataclass
from typing import Optional, Tuple, List, Dict, Any
from datetime import datetime
import copy
import itertools
import warnings

## Parametrai

### α (Alpha) - Mokymosi greitis

**Formulė:**
$$Q_{naujas} = Q_{senas} + \alpha \cdot (target - Q_{senas})$$

**Ką reiškia α:**
- **α = 0**: Nieko nesimoko (ignoruoja naują informaciją)
- **α = 0.5**: Greitas mokymasis (50% naujo + 50% seno)
- **α = 1.0**: Mokosi tik iš naujos informacijos (100% naujo)

---

### γ (Gamma) - Nuolaidos faktorius

**Formulė:**
$$target = r + \gamma \cdot Q(s', a')$$

**Ką reiškia γ:**
- **γ = 0**: Žiūri tik į dabartinį atlygį (trumparegis)
- **γ = 0.9**: Ateitis beveik tokia pat svarbi kaip dabartis
- **γ = 1.0**: Ateitis lygiai tokia pat svarbi

---

### ε (Epsilon) - Tyrinėjimų tikimybė

**Formulė (Epsilon-Greedy):**
$$a = \begin{cases} random(A) & \text{su tikimybe } \epsilon \\ \arg\max_a Q(s,a) & \text{su tikimybe } 1-\epsilon \end{cases}$$

**Ką reiškia ε:**
- **ε = 0.05**: 5% atsitiktinis, 95% geriausias
- **ε = 0.15**: 15% atsitiktinis, 85% geriausias
- **ε = 0.30**: 30% atsitiktinis, 70% geriausias


## Parametrų matrica

In [None]:
# Hyper-parameter grid: every key is one sweep axis, every list holds the
# values tried on that axis.
PARAM_GRID = {
    'alpha': [0.1, 0.3, 0.5],
    'gamma': [0.5, 0.9, 0.99],
    'epsilon': [0.05, 0.15, 0.30],
    'exploration_method': ['epsilon_greedy', 'thompson_sampling'],
    'strategy': ['A', 'B', 'C'],
    'similarity_method': ['cosine', 'euclidean'],
}

# Names of the TD algorithms that are compared on the grid.
MODEL_NAMES = ['Q-Learning', 'SARSA', 'Expected-SARSA']

# Number of candidate values per axis, unpacked in the same order as the keys.
n_alpha, n_gamma, n_epsilon, n_exploration, n_strategy, n_similarity = (
    len(PARAM_GRID[axis])
    for axis in ('alpha', 'gamma', 'epsilon',
                 'exploration_method', 'strategy', 'similarity_method')
)
n_models = len(MODEL_NAMES)

# Size of the full Cartesian product: models x every grid axis.
total_combinations = n_models
total_combinations *= n_alpha * n_gamma * n_epsilon
total_combinations *= n_exploration * n_strategy * n_similarity

print(f"Alpha: {PARAM_GRID['alpha']}")
print(f"Gamma: {PARAM_GRID['gamma']}")
print(f"Epsilon: {PARAM_GRID['epsilon']}")
print(f"Tyrinėjimo metodai: {PARAM_GRID['exploration_method']}")
print(f"Strategijos: {PARAM_GRID['strategy']}")
print(f"Panašumo metodai: {PARAM_GRID['similarity_method']}")
print(f"Modeliai: {MODEL_NAMES}")
print(f"\nKombinacijų skaičius: {total_combinations}")

## Vaisių savybės

| Savybė | Aprašymas | Skalė |
|--------|-----------|-------|
| kietumas | Kiek kietas vaisius | 0 (minkštas) → 1 (kietas) |
| saldumas | Kiek saldus | 0 (nesaldus) → 1 (labai saldus) |
| rūgštingumas | Kiek rūgštus | 0 (nerūgštus) → 1 (labai rūgštus) |
| forma | Formos neįprastumas | 0 (apvalus) → 1 (pailgas) |
| tekstūra | Paviršiaus tekstūra | 0 (lygus) → 1 (šiurkštus) |
| spalva | Spalvos tonas | 0 (raudona) → 1 (mėlyna) |


In [None]:
# Feature table: fruit name -> property vector. Every property value in this
# literal lies between 0 and 1 (hardness, sweetness, acidity, shape, texture,
# colour — keys are ASCII spellings of the Lithuanian property names).
FOOD_DATABASE = {
    'Obuolys': {'kietumas': 0.8, 'saldumas': 0.7, 'rugstingumas': 0.4, 'forma': 0.0, 'tekstura': 0.2, 'spalva': 0.0},
    'Žaliasis obuolys': {'kietumas': 0.8, 'saldumas': 0.5, 'rugstingumas': 0.7, 'forma': 0.0, 'tekstura': 0.2, 'spalva': 0.66},
    'Kriaušė': {'kietumas': 0.6, 'saldumas': 0.8, 'rugstingumas': 0.2, 'forma': 0.3, 'tekstura': 0.3, 'spalva': 0.5},
    'Bananas': {'kietumas': 0.3, 'saldumas': 0.9, 'rugstingumas': 0.1, 'forma': 1.0, 'tekstura': 0.0, 'spalva': 0.33},
    'Persikai': {'kietumas': 0.4, 'saldumas': 0.8, 'rugstingumas': 0.2, 'forma': 0.0, 'tekstura': 0.8, 'spalva': 0.15},
    'Mangas': {'kietumas': 0.5, 'saldumas': 0.9, 'rugstingumas': 0.3, 'forma': 0.2, 'tekstura': 0.1, 'spalva': 0.25},
    'Apelsinas': {'kietumas': 0.6, 'saldumas': 0.7, 'rugstingumas': 0.5, 'forma': 0.0, 'tekstura': 0.4, 'spalva': 0.2},
    'Mandarinas': {'kietumas': 0.5, 'saldumas': 0.8, 'rugstingumas': 0.4, 'forma': 0.0, 'tekstura': 0.3, 'spalva': 0.2},
    'Arbūzas': {'kietumas': 0.3, 'saldumas': 0.8, 'rugstingumas': 0.1, 'forma': 0.0, 'tekstura': 0.0, 'spalva': 0.0},
    'Melionas': {'kietumas': 0.4, 'saldumas': 0.7, 'rugstingumas': 0.1, 'forma': 0.0, 'tekstura': 0.1, 'spalva': 0.4},
    'Ananasas': {'kietumas': 0.6, 'saldumas': 0.8, 'rugstingumas': 0.6, 'forma': 0.5, 'tekstura': 0.9, 'spalva': 0.33},
    'Kiviai': {'kietumas': 0.5, 'saldumas': 0.6, 'rugstingumas': 0.5, 'forma': 0.0, 'tekstura': 0.6, 'spalva': 0.66},
    'Vyšnios': {'kietumas': 0.7, 'saldumas': 0.7, 'rugstingumas': 0.3, 'forma': 0.0, 'tekstura': 0.1, 'spalva': 0.0},
    'Braškės': {'kietumas': 0.4, 'saldumas': 0.8, 'rugstingumas': 0.2, 'forma': 0.4, 'tekstura': 0.5, 'spalva': 0.0},
    'Mėlynės': {'kietumas': 0.5, 'saldumas': 0.6, 'rugstingumas': 0.4, 'forma': 0.0, 'tekstura': 0.1, 'spalva': 1.0},
    'Avietės': {'kietumas': 0.3, 'saldumas': 0.7, 'rugstingumas': 0.3, 'forma': 0.1, 'tekstura': 0.7, 'spalva': 0.05},
    'Vynuogės': {'kietumas': 0.6, 'saldumas': 0.8, 'rugstingumas': 0.2, 'forma': 0.0, 'tekstura': 0.0, 'spalva': 0.8}
}

# Column order used whenever the property vectors are pulled out of FOOD_DF.
PROPERTY_COLS = ['kietumas', 'saldumas', 'rugstingumas', 'forma', 'tekstura', 'spalva']

# Flatten the nested dict into one row per fruit, with the name stored under
# the 'pavadinimas' column next to the property columns.
food_data = [
    {'pavadinimas': name, **props}
    for name, props in FOOD_DATABASE.items()
]

# FOOD_DF keeps the rows in FOOD_DATABASE insertion order; FOOD_LIST is the
# matching list of fruit names, so a name's position in FOOD_LIST is also its
# row position in FOOD_DF.
FOOD_DF = pd.DataFrame(food_data)
FOOD_LIST = FOOD_DF['pavadinimas'].tolist()

print(f"{len(FOOD_DF)} produktų, {len(PROPERTY_COLS)} savybės")


## Panašumo metodai

### 1. Kosinuso panašumas (Cosine Similarity)

**Formulė:**
$$\text{cosine}(A, B) = \frac{A \cdot B}{\|A\| \times \|B\|} = \frac{\sum_{i=1}^{n} A_i \times B_i}{\sqrt{\sum_{i=1}^{n} A_i^2} \times \sqrt{\sum_{i=1}^{n} B_i^2}}$$

**Kur:**
- $A$ = pirmo vaisiaus savybių vektorius, pvz. Bananas = [0.3, 0.9, 0.1, 1.0, 0.0, 0.33]
- $B$ = antro vaisiaus savybių vektorius, pvz. Mangas = [0.5, 0.9, 0.3, 0.2, 0.1, 0.25]
- $A \cdot B$ = skaliarinė sandauga (dot product)
- $\|A\|$ = vektoriaus A ilgis (norma)
- $n$ = savybių skaičius (6)

**Rezultatas:**
- 0 = visiškai skirtingi (90° kampas)
- 1 = identiški (0° kampas)

---

### 2. Euklido panašumas (Euclidean Similarity)

**Atstumas:**
$$\text{distance}(A, B) = \sqrt{\sum_{i=1}^{n} (A_i - B_i)^2}$$

**Konversija į panašumą:**
$$\text{similarity} = \frac{1}{1 + \text{distance}}$$

**Kur:**
- $(A_i - B_i)^2$ = skirtumo kvadratas kiekvienai savybei
- Konversija reikalinga nes atstumas = kuo didesnis, tuo mažiau panašūs

**Rezultatas:**
- ~0 = labai skirtingi (didelis atstumas)
- 1 = identiški (atstumas = 0)


In [None]:
def compute_similarity_matrix(method='cosine'):
    """Build the pairwise fruit-similarity matrix from ``FOOD_DF``.

    Rows and columns follow the row order of ``FOOD_DF``.

    Args:
        method: ``'cosine'`` for cosine similarity of the property vectors,
            or ``'euclidean'`` for ``1 / (1 + distance)`` where ``distance``
            is the Euclidean distance between the property vectors.

    Returns:
        A square ``numpy`` array of similarities whose diagonal is forced to
        0 (self-similarity is reported as 0, not 1) — presumably so that a
        fruit never ranks as its own nearest neighbour; confirm with callers.

    Raises:
        ValueError: If ``method`` is neither ``'cosine'`` nor ``'euclidean'``.
            Previously any other value silently produced the Euclidean matrix.
    """
    if method not in ('cosine', 'euclidean'):
        raise ValueError(
            f"Unknown similarity method {method!r}; expected 'cosine' or 'euclidean'"
        )

    vectors = FOOD_DF[PROPERTY_COLS].values

    if method == 'cosine':
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # Guard against division by zero for an all-zero property vector.
        norms[norms == 0] = 1
        normalized = vectors / norms
        sim = normalized @ normalized.T
    else:
        # Broadcast to an (n, n, d) array of pairwise differences instead of
        # looping over every (i, j) pair in Python.
        diff = vectors[:, np.newaxis, :] - vectors[np.newaxis, :, :]
        dist = np.sqrt(np.sum(diff ** 2, axis=-1))
        sim = 1 / (1 + dist)

    np.fill_diagonal(sim, 0)
    return sim

# Similarity matrices computed once up front, keyed by similarity method name.
SIMILARITY_MATRICES = {
    method_name: compute_similarity_matrix(method_name)
    for method_name in ('cosine', 'euclidean')
}

---
## Būsena (STATE)


**State:**
$$S = (rejections, last\_action)$$

**Kur:**
- $rejections$ = kiek kartų vaikas atmetė šioje sesijoje (0, 1, 2, 3)
- $last\_action$ = paskutinis veiksmas ('start', 'success', 'rejection')

**Galimos būsenos:**
```
S(0, start)     - Sesijos pradžia
S(0, success)   - Atitiktis be atmetimų
S(1, rejection) - Po pirmo atmetimo
S(2, rejection) - Po antro atmetimo  
S(3, rejection) - Po trečio atmetimo
```

**Viso galimų būsenų:** 4 × 3 = 12

---

## Būsena

In [None]:
@dataclass(frozen=True)
class State:
    """Immutable, hashable RL state: rejection count plus the last outcome."""

    # Number of rejections counted so far.
    rejections: int
    # Label of the most recent outcome, e.g. 'start' or 'success'.
    last_action: str

    def __repr__(self):
        # Only the first three characters of the outcome label are shown.
        outcome_tag = self.last_action[:3]
        return f"S(rej={self.rejections},{outcome_tag})"

## Virtualaus vaiko norų sąrašas



In [None]:
# Scripted sequence of fruit names the virtual child wants, in order
# (20 rows of 5 = 100 entries). Every name is a key of FOOD_DATABASE.
CHILD_WANTS = [
    "Bananas", "Bananas", "Mangas", "Bananas", "Mangas",
    "Bananas", "Bananas", "Mangas", "Bananas", "Mandarinas",
    "Bananas", "Bananas", "Mangas", "Bananas", "Mangas",
    "Bananas", "Bananas", "Obuolys", "Obuolys", "Kriaušė",
    "Kriaušė", "Bananas", "Obuolys", "Kriaušė", "Bananas",
    "Obuolys", "Kriaušė", "Mandarinas", "Obuolys", "Kriaušė",
    "Bananas", "Obuolys", "Obuolys", "Kriaušė", "Bananas",
    "Mangas", "Bananas", "Obuolys", "Kriaušė", "Mandarinas",
    "Bananas", "Bananas", "Mėlynės", "Mėlynės", "Vynuogės",
    "Vynuogės", "Bananas", "Mėlynės", "Vynuogės", "Mandarinas",
    "Mėlynės", "Vynuogės", "Bananas", "Obuolys", "Kriaušė",
    "Mėlynės", "Vynuogės", "Bananas", "Mandarinas", "Mėlynės",
    "Vynuogės", "Bananas", "Mangas", "Bananas", "Bananas",
    "Obuolys", "Bananas", "Mandarinas", "Bananas", "Kriaušė",
    "Bananas", "Mėlynės", "Bananas", "Vynuogės", "Bananas",
    "Bananas", "Mandarinas", "Bananas", "Mangas", "Bananas",
    "Obuolys", "Bananas", "Obuolys", "Bananas", "Bananas",
    "Mėlynės", "Mandarinas", "Obuolys", "Mandarinas", "Bananas",
    "Mandarinas", "Obuolys", "Mangas", "Mėlynės", "Bananas",
    "Vynuogės", "Mangas", "Mėlynės", "Obuolys", "Bananas"
]

## Rezultatų logger'is

In [None]:
class ExtendedResultsLogger:
    """Accumulate one flat record per logged card pair plus running statistics.

    Besides the raw records the logger keeps cumulative success/rejection
    counters, the outcomes of the most recent 20 ``log`` calls (for rolling
    success rates) and the current/maximum success and failure streaks.
    """

    def __init__(self):
        self.records = []
        # Outcomes (1 = success, 0 = rejection) of the last 20 logged rows.
        self.recent_results = deque(maxlen=20)
        self.reset_cumulative()

    def reset_cumulative(self):
        """Zero all running counters, the rolling window and the streaks.

        Already collected ``records`` are kept.
        """
        self.cumulative_success = 0
        self.cumulative_rejection = 0
        self.recent_results.clear()
        self.current_success_streak = 0
        self.current_fail_streak = 0
        self.max_success_streak = 0
        self.max_fail_streak = 0

    def log(self,
            # Core fields
            session: int,
            wanted: str,
            card1: str,
            card2: str,
            result: str,  # 'SUCCESS' or 'REJECTION'
            chosen: str,  # Chosen card, or ''
            set_number: int,
            state: State,
            selection_reason: str,

            # Configuration
            model_name: str,
            exploration: str,
            strategy: str,
            alpha: float,
            gamma: float,
            epsilon: float,
            similarity_method: str,

            # Q values
            q1_before: float,
            q2_before: float,
            q1_after: float,
            q2_after: float,

            # Exploration
            was_exploration: bool,
            best_q_card: str,
            best_q_value: float,

            # Thompson Sampling (if applicable)
            ts_sample_card1: Optional[float],
            ts_sample_card2: Optional[float],

            # Similarity
            similarity_card1_to_rejected: float,
            similarity_card2_to_rejected: float,

            # Session
            session_success: bool,
            ):
        """Update the running statistics and append one record.

        ``result == 'SUCCESS'`` counts as a success; any other value counts
        as a rejection. Floating-point fields are rounded to 4 decimals in
        the stored record; Thompson samples stay ``None`` when not supplied.
        """
        succeeded = result == 'SUCCESS'

        if succeeded:
            self.cumulative_success += 1
            self.current_success_streak += 1
            self.current_fail_streak = 0
            self.max_success_streak = max(self.max_success_streak, self.current_success_streak)
        else:
            self.cumulative_rejection += 1
            self.current_fail_streak += 1
            self.current_success_streak = 0
            self.max_fail_streak = max(self.max_fail_streak, self.current_fail_streak)

        self.recent_results.append(1 if succeeded else 0)

        # The window holds at least the outcome appended just above, so both
        # divisions are safe; it holds at most 20 entries (deque maxlen).
        last_10 = list(self.recent_results)[-10:]
        last_10_rate = sum(last_10) / len(last_10)
        last_20_rate = sum(self.recent_results) / len(self.recent_results)

        q1_change = q1_after - q1_before
        q2_change = q2_after - q2_before

        record = {
            # Core fields
            'session': session,
            'set_number': set_number,
            'wanted': wanted,
            'card1': card1,
            'card2': card2,
            'result': result,
            'chosen': chosen,
            'state': str(state),
            'selection_reason': selection_reason,

            # Configuration
            'model': model_name,
            'exploration': exploration,
            'strategy': strategy,
            'alpha': alpha,
            'gamma': gamma,
            'epsilon': epsilon,
            'similarity_method': similarity_method,

            # Q values
            'q1_before': round(q1_before, 4),
            'q2_before': round(q2_before, 4),
            'q1_after': round(q1_after, 4),
            'q2_after': round(q2_after, 4),
            'q1_change': round(q1_change, 4),
            'q2_change': round(q2_change, 4),

            # Exploration
            'was_exploration': was_exploration,
            'best_q_card': best_q_card,
            'best_q_value': round(best_q_value, 4),

            # Thompson Sampling (if applicable)
            'ts_sample_card1': round(ts_sample_card1, 4) if ts_sample_card1 is not None else None,
            'ts_sample_card2': round(ts_sample_card2, 4) if ts_sample_card2 is not None else None,

            # Similarity
            'sim_card1_to_rejected': round(similarity_card1_to_rejected, 4),
            'sim_card2_to_rejected': round(similarity_card2_to_rejected, 4),

            # Cumulative counters
            'cumulative_success': self.cumulative_success,
            'cumulative_rejection': self.cumulative_rejection,

            # Learning progress
            'last_10_success_rate': round(last_10_rate, 4),
            'last_20_success_rate': round(last_20_rate, 4),

            # Streaks
            'current_success_streak': self.current_success_streak,
            'current_fail_streak': self.current_fail_streak,
            'max_success_streak': self.max_success_streak,
            # Tracked alongside max_success_streak but previously never
            # written to the record.
            'max_fail_streak': self.max_fail_streak,

            # Session
            'session_success': session_success,
        }
        self.records.append(record)

    def get_dataframe(self) -> pd.DataFrame:
        """Return all records as a DataFrame, one row per ``log`` call."""
        return pd.DataFrame(self.records)

    def get_csv_string(self) -> str:
        """Return the records as CSV text without the index column."""
        return self.get_dataframe().to_csv(index=False)

    def save_csv(self, filename: str):
        """Write the records to ``filename`` as CSV and print a confirmation."""
        self.get_dataframe().to_csv(filename, index=False)
        print(f"Išsaugota: {filename}")

    def clear(self):
        """Drop all records and reset every running statistic."""
        self.records = []
        self.reset_cumulative()


# Module-level logger instance.
LOGGER = ExtendedResultsLogger()


## Tyrinėjimo metodai
---
### 1. Epsilon-Greedy

**Formulė:**
$$a = \begin{cases} \text{random}(A) & \text{su tikimybe } \epsilon \\ \arg\max_a Q(s,a) & \text{su tikimybe } 1-\epsilon \end{cases}$$

**Pavyzdys su ε = 0.15:**
```
Q(Bananas) = 0.8, Q(Obuolys) = 0.3, Q(Kiviai) = 0.1

random() = 0.23 (> 0.15) → Renkasi Bananą (max Q)
random() = 0.08 (< 0.15) → Renkasi atsitiktinai (gal Kivius!)
```

---

### 2. Thompson Sampling

**Formulė:**
$$\theta_i \sim \text{Beta}(\alpha_i, \beta_i)$$
$$a = \arg\max_i \theta_i$$

**Kur:**
- $\alpha_i$ = atitikčių skaičius + 1
- $\beta_i$ = neatitikčių skaičius + 1
- $\theta_i$ = atsitiktinė reikšmė iš Beta pasiskirstymo


**Pavyzdys:**
```
Bananas: 10 atitikčių, 2 neatitiktys → Beta(11, 3)
Obuolys: 3 atitiktys, 5 neatitiktys  → Beta(4, 6)
Kiviai:  1 atitiktis, 8 neatitiktys   → Beta(2, 9)

Imame atsitiktinę reikšmę:
θ_Bananas = sample(Beta(11,3)) = 0.78
θ_Obuolys = sample(Beta(4,6))  = 0.42
θ_Kiviai  = sample(Beta(2,9))  = 0.15

→ Renkasi Bananą (max θ = 0.78)
```

## Strategijos

### Strategija A: Panaši + Priešinga (Gylio/Pločio)

```
Rinkinys 1 (rejections=0):
    → TOP 2 kortelės (pagal Q arba Thompson Sampling)

Rinkinys 2+ (rejections≥1):
    → 1 PANAŠIAUSIA į atmestąsias
    → 1 PRIEŠINGIAUSIA
```

---

### Strategija B: 2 Panašiausios (Gylio)

```
Rinkinys 1 (rejections=0):
    → TOP 2 kortelės

Rinkinys 2+ (rejections≥1):
    → 2 PANAŠIAUSIOS į atmestąsias
```

---

### Strategija C: Mišri (Gylio/Dažnio)

```
Rinkinys 1 (rejections=0):
    → TOP 2 kortelės

Rinkinys 2 (rejections=1):
    → 1 PANAŠIAUSIA į atmestąsias
    → 1 iš TOP (dar nerodyta)

Rinkinys 3 (rejections=2):
    → 2 PANAŠIAUSIOS
```

---

## Bazinis RL modelis

In [None]:
class BaseRLModel:
    """Shared state and card-selection logic for the tabular TD models.

    The class owns the Q-table, the per-card success/failure counts used by
    Thompson sampling, the per-session bookkeeping (cards already shown, last
    rejected pair, current state) and the three card-pair strategies A/B/C.
    Subclasses supply the learning rule by implementing ``update``.
    """

    def __init__(self, alpha: float, gamma: float, epsilon: float,
                 exploration_method: str, strategy: str, similarity_method: str):
        """Store the hyper-parameters and create empty learning state.

        Args:
            alpha: Learning rate used by the subclasses' updates.
            gamma: Discount factor used by the subclasses' updates.
            epsilon: Probability of a random pick in epsilon-greedy selection.
            exploration_method: ``'thompson_sampling'`` selects Thompson
                sampling; any other value selects epsilon-greedy.
            strategy: ``'A'`` or ``'B'``; any other value selects strategy C.
            similarity_method: Key into ``SIMILARITY_MATRICES``.
        """
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.exploration_method = exploration_method
        self.strategy = strategy
        self.similarity_method = similarity_method
        self.name = "Base"

        # state -> card -> Q value; missing entries default to 0.0.
        self.q_table: Dict[State, Dict[str, float]] = defaultdict(lambda: defaultdict(float))

        # Per-card outcome counts feeding the Beta posteriors.
        self.successes: Dict[str, int] = defaultdict(int)
        self.failures: Dict[str, int] = defaultdict(int)

        self.current_state: State = State(0, 'start')
        self.session_shown: List[str] = []
        self.last_rejected: List[str] = []

        self.similarity_matrix = SIMILARITY_MATRICES[similarity_method]

        # Diagnostics describing the most recent selection.
        self.last_was_exploration = False
        self.last_ts_samples: Dict[str, float] = {}

    def get_q(self, state: State, food: str) -> float:
        """Return Q(state, food); an unseen pair is created with value 0.0."""
        return self.q_table[state][food]

    def get_best_q_info(self, state: State, available: List[str]) -> Tuple[str, float]:
        """Return ``(card, q)`` for the highest-Q card among ``available``.

        Ties go to the card that appears first in ``available``.
        """
        q_vals = [(f, self.get_q(state, f)) for f in available]
        q_vals.sort(key=lambda x: x[1], reverse=True)
        return q_vals[0][0], q_vals[0][1]

    def epsilon_greedy_select(self, state: State, available: List[str]) -> str:
        """Pick a random card with probability ``epsilon``, else the best-Q card.

        Sets ``last_was_exploration`` to tell which branch was taken.
        """
        if random.random() < self.epsilon:
            self.last_was_exploration = True
            return random.choice(available)
        self.last_was_exploration = False
        best, _ = self.get_best_q_info(state, available)
        return best

    def _thompson_ranking(self, available: List[str]) -> List[str]:
        """Sample every card's Beta posterior and rank cards by the sample.

        Each card is sampled from Beta(successes + 1, failures + 1). The
        samples replace ``last_ts_samples`` and ``last_was_exploration`` is
        cleared. Returns card names ordered from highest to lowest sample.
        """
        self.last_ts_samples = {}
        for food in available:
            alpha = self.successes[food] + 1
            beta = self.failures[food] + 1
            self.last_ts_samples[food] = np.random.beta(alpha, beta)
        self.last_was_exploration = False
        ranked = sorted(self.last_ts_samples.items(), key=lambda x: x[1], reverse=True)
        return [food for food, _ in ranked]

    def thompson_sampling_select(self, available: List[str]) -> str:
        """Return the card with the highest Thompson sample."""
        return self._thompson_ranking(available)[0]

    def select_by_exploration(self, state: State, available: List[str]) -> str:
        """Select one card with the configured exploration method."""
        if self.exploration_method == 'thompson_sampling':
            return self.thompson_sampling_select(available)
        return self.epsilon_greedy_select(state, available)

    def get_top_cards(self, state: State, available: List[str], n: int = 2) -> List[str]:
        """Return up to ``n`` distinct cards chosen by the exploration method.

        Thompson sampling draws one sample per card and takes the ``n``
        highest. Epsilon-greedy picks cards one at a time, removing each pick
        from the candidates before choosing the next.
        """
        if self.exploration_method == 'thompson_sampling':
            return self._thompson_ranking(available)[:n]

        result = []
        remaining = available.copy()
        for _ in range(n):
            if not remaining:
                break
            card = self.epsilon_greedy_select(state, remaining)
            result.append(card)
            remaining.remove(card)
        return result

    def _mean_similarity(self, card: str, reference_cards: List[str]) -> float:
        """Average similarity of ``card`` to the known reference cards.

        References missing from ``FOOD_LIST`` are ignored; if none remain the
        result is 0.0 rather than the NaN that averaging an empty list gives.
        """
        card_idx = FOOD_LIST.index(card)
        sims = [self.similarity_matrix[card_idx][FOOD_LIST.index(ref)]
                for ref in reference_cards if ref in FOOD_LIST]
        return float(np.mean(sims)) if sims else 0.0

    def _rank_by_similarity(self, reference_cards: List[str], available: List[str],
                            descending: bool) -> List[str]:
        """Order ``available`` by average similarity to ``reference_cards``."""
        scored = [(food, self._mean_similarity(food, reference_cards)) for food in available]
        scored.sort(key=lambda x: x[1], reverse=descending)
        return [food for food, _ in scored]

    def get_similarity_to_rejected(self, card: str) -> float:
        """Return ``card``'s average similarity to the last rejected cards.

        Returns 0.0 when nothing has been rejected in the current session.
        """
        if not self.last_rejected:
            return 0.0
        return self._mean_similarity(card, self.last_rejected)

    def get_most_similar(self, reference_cards: List[str], available: List[str], n: int = 1) -> List[str]:
        """Return the ``n`` available cards most similar to ``reference_cards``.

        With no references or no candidates the first ``n`` candidates are
        returned unchanged.
        """
        if not reference_cards or not available:
            return available[:n]
        return self._rank_by_similarity(reference_cards, available, descending=True)[:n]

    def get_most_opposite(self, reference_cards: List[str], available: List[str], n: int = 1) -> List[str]:
        """Return the ``n`` available cards least similar to ``reference_cards``.

        With no references or no candidates the first ``n`` candidates are
        returned unchanged.
        """
        if not reference_cards or not available:
            return available[:n]
        return self._rank_by_similarity(reference_cards, available, descending=False)[:n]

    def select_cards(self) -> Tuple[str, str, str]:
        """Choose the next card pair according to the configured strategy.

        Cards already shown in this session are excluded; when fewer than two
        remain, the shown list is cleared and every card is available again.

        Returns:
            ``(card1, card2, reason)`` where ``reason`` labels the rule used.
        """
        available = [f for f in FOOD_LIST if f not in self.session_shown]
        if len(available) < 2:
            self.session_shown = []
            available = FOOD_LIST.copy()

        rejections = self.current_state.rejections

        if self.strategy == 'A':
            return self._strategy_A(available, rejections)
        if self.strategy == 'B':
            return self._strategy_B(available, rejections)
        return self._strategy_C(available, rejections)

    def _present(self, cards: List[str], reason: str) -> Tuple[str, str, str]:
        """Mark the picked cards as shown and return ``(card1, card2, reason)``.

        If only one card was picked it is returned in both positions.
        """
        self.session_shown.extend(cards[:2])
        second = cards[1] if len(cards) > 1 else cards[0]
        return (cards[0], second, reason)

    def _strategy_A(self, available: List[str], rejections: int) -> Tuple[str, str, str]:
        """Strategy A: top-2 first; after a rejection, most similar + most opposite."""
        if rejections == 0:
            cards = self.get_top_cards(self.current_state, available, 2)
            reason = f"TOP2({self.exploration_method})"
        else:
            similar = self.get_most_similar(self.last_rejected, available, 1)
            remaining = [f for f in available if f not in similar]
            opposite = self.get_most_opposite(self.last_rejected, remaining, 1) if remaining else similar
            cards = similar + opposite
            reason = f"Panasus+Priesingas(rej={rejections})"
        return self._present(cards, reason)

    def _strategy_B(self, available: List[str], rejections: int) -> Tuple[str, str, str]:
        """Strategy B: top-2 first; after a rejection, the two most similar cards."""
        if rejections == 0:
            cards = self.get_top_cards(self.current_state, available, 2)
            reason = f"TOP2({self.exploration_method})"
        else:
            cards = self.get_most_similar(self.last_rejected, available, 2)
            reason = f"2Panasus(rej={rejections})"
        return self._present(cards, reason)

    def _strategy_C(self, available: List[str], rejections: int) -> Tuple[str, str, str]:
        """Strategy C: top-2, then most similar + one top card, then two most similar."""
        if rejections == 0:
            cards = self.get_top_cards(self.current_state, available, 2)
            reason = f"TOP2({self.exploration_method})"
        elif rejections == 1:
            similar = self.get_most_similar(self.last_rejected, available, 1)
            remaining = [f for f in available if f not in similar]
            top_remaining = self.get_top_cards(self.current_state, remaining, 1) if remaining else similar
            cards = similar + top_remaining
            reason = "Panasus+TOP"
        else:
            cards = self.get_most_similar(self.last_rejected, available, 2)
            reason = "2Panasus"
        return self._present(cards, reason)

    def start_new_session(self):
        """Clear the per-session bookkeeping and return to the start state."""
        self.session_shown = []
        self.last_rejected = []
        self.current_state = State(0, 'start')

    def record_rejection(self, cards: List[str]):
        """Remember ``cards`` as the last rejected set and count a failure for each."""
        self.last_rejected = cards.copy()
        for card in cards:
            self.failures[card] += 1

    def record_success(self, chosen: str, other: str):
        """Count a success for the chosen card and a failure for the other one."""
        self.successes[chosen] += 1
        self.failures[other] += 1

    def update(self, state: State, action: Tuple[str, str], reward: float,
               next_state: State, chosen: Optional[str] = None) -> Tuple[float, float]:
        """Apply the learning rule; the base class provides no implementation.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

## Algoritmai


---



## Bendra TD (Temporal Difference) formulė:

$$Q(s,a) \leftarrow Q(s,a) + \alpha \cdot [\underbrace{r + \gamma \cdot V(s')}_{\text{TD Target}} - Q(s,a)]$$

**Kur:**
- $Q(s,a)$ = dabartinis kortelės įvertinimas būsenoje s
- $\alpha$ = mokymosi greitis (learning rate)
- $r$ = gautas apdovanojimas (+1 sėkmė, -1 atmetimas)
- $\gamma$ = ateities nuolaida (discount factor)
- $V(s')$ = kitos būsenos vertė (skiriasi priklausomai nuo algoritmo)
- $[...]$ = TD Error (skirtumas tarp tikėtino ir faktinio)

---

## 1. Q-Learning

**Formulė:**
$$Q(s,a) \leftarrow Q(s,a) + \alpha \cdot [r + \gamma \cdot \max_{a'} Q(s',a') - Q(s,a)]$$

**Kur:**
- $\max_{a'} Q(s',a')$ = **geriausio** galimo veiksmo vertė kitoje būsenoje

---

## 2. SARSA (State-Action-Reward-State-Action)

**Formulė:**
$$Q(s,a) \leftarrow Q(s,a) + \alpha \cdot [r + \gamma \cdot Q(s',a') - Q(s,a)]$$

**Kur:**
- $Q(s',a')$ = **faktinio** kito veiksmo vertė (to, kurį iš tikrųjų padarė)


---

## 3. Expected SARSA

**Formulė:**
$$Q(s,a) \leftarrow Q(s,a) + \alpha \cdot [r + \gamma \cdot \mathbb{E}[Q(s',a')] - Q(s,a)]$$

**Kur:**
$$\mathbb{E}[Q(s',a')] = (1-\epsilon) \cdot \max_{a'} Q(s',a') + \epsilon \cdot \frac{1}{|A|} \sum_{a'} Q(s',a')$$

**Arba paprasčiau:**
- $\mathbb{E}[Q(s',a')]$ = **tikėtina** kito veiksmo vertė (vidurkis pagal tikimybes)



In [None]:
class QLearningModel(BaseRLModel):
    """Q-Learning: the TD target bootstraps from the best next-state Q value."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.name = "Q-Learning"

    def update(self, state: State, action: Tuple[str, str], reward: float,
               next_state: State, chosen: Optional[str] = None) -> Tuple[float, float]:
        """Update Q(state, card) for both shown cards and return the new values.

        With ``chosen=None`` both cards receive ``reward``. Otherwise the
        chosen card receives ``reward`` and the other card a fixed -0.2.

        Returns:
            ``(Q(state, card1), Q(state, card2))`` after the update.
        """
        card1, card2 = action

        # max over every card of Q(s', a'), taken before any entry is changed.
        bootstrap = max(self.get_q(next_state, food) for food in FOOD_LIST)

        if chosen is None:
            credit = [(card1, reward), (card2, reward)]
        else:
            other = card2 if chosen == card1 else card1
            credit = [(chosen, reward), (other, -0.2)]

        for card, card_reward in credit:
            previous = self.get_q(state, card)
            # TD target = r + gamma * max Q(s', a')
            td_target = card_reward + self.gamma * bootstrap
            # Q <- Q + alpha * (TD target - Q)
            self.q_table[state][card] = previous + self.alpha * (td_target - previous)

        return (self.get_q(state, card1), self.get_q(state, card2))


class SARSAModel(BaseRLModel):
    """SARSA-style model: the TD target bootstraps from a sampled next pair."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.name = "SARSA"

    def update(self, state: State, action: Tuple[str, str], reward: float,
               next_state: State, chosen: Optional[str] = None) -> Tuple[float, float]:
        """Update Q(state, card) for both shown cards and return the new values.

        The bootstrap value is the mean Q of two cards drawn for
        ``next_state`` with the configured exploration method from the cards
        not yet shown this session, or 0 when fewer than two such cards
        remain. With ``chosen=None`` both cards receive ``reward``; otherwise
        the chosen card receives ``reward`` and the other a fixed -0.2.

        Returns:
            ``(Q(state, card1), Q(state, card2))`` after the update.
        """
        card1, card2 = action

        # The look-ahead below goes through select_by_exploration, which
        # overwrites last_ts_samples / last_was_exploration. Those attributes
        # describe the cards that were actually shown, so keep them and put
        # them back afterwards; otherwise anything reading them after
        # update() would see the hypothetical look-ahead instead.
        shown_ts_samples = dict(self.last_ts_samples)
        shown_was_exploration = self.last_was_exploration

        available = [f for f in FOOD_LIST if f not in self.session_shown]
        if len(available) >= 2:
            next_card1 = self.select_by_exploration(next_state, available)
            next_card2 = self.select_by_exploration(next_state, [f for f in available if f != next_card1])
            next_q = (self.get_q(next_state, next_card1) + self.get_q(next_state, next_card2)) / 2
        else:
            next_q = 0

        self.last_ts_samples = shown_ts_samples
        self.last_was_exploration = shown_was_exploration

        reward_other = -0.2

        if chosen is None:
            for card in [card1, card2]:
                old_q = self.get_q(state, card)
                # TD target = r + gamma * Q(s', a')
                td_target = reward + self.gamma * next_q
                self.q_table[state][card] = old_q + self.alpha * (td_target - old_q)
            return (self.get_q(state, card1), self.get_q(state, card2))

        other = card2 if chosen == card1 else card1

        old_q = self.get_q(state, chosen)
        td_target = reward + self.gamma * next_q
        self.q_table[state][chosen] = old_q + self.alpha * (td_target - old_q)

        old_q_other = self.get_q(state, other)
        td_target_other = reward_other + self.gamma * next_q
        self.q_table[state][other] = old_q_other + self.alpha * (td_target_other - old_q_other)

        return (self.get_q(state, card1), self.get_q(state, card2))


class ExpectedSARSAModel(BaseRLModel):
    """Expected SARSA: the TD target bootstraps from an expected next Q value."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.name = "Expected-SARSA"

    def get_expected_q(self, state: State, available: List[str]) -> float:
        """Return (1 - epsilon) * max Q + epsilon * mean Q over ``available``.

        An empty ``available`` list yields 0.0.
        """
        if not available:
            return 0.0
        values = [self.get_q(state, food) for food in available]
        greedy_part = (1 - self.epsilon) * max(values)
        uniform_part = self.epsilon * (sum(values) / len(values))
        return greedy_part + uniform_part

    def update(self, state: State, action: Tuple[str, str], reward: float,
               next_state: State, chosen: Optional[str] = None) -> Tuple[float, float]:
        """Update Q(state, card) for both shown cards and return the new values.

        The expectation is taken over the cards not yet shown this session,
        or over every card when fewer than two of those remain. With
        ``chosen=None`` both cards receive ``reward``; otherwise the chosen
        card receives ``reward`` and the other card a fixed -0.2.

        Returns:
            ``(Q(state, card1), Q(state, card2))`` after the update.
        """
        card1, card2 = action

        candidates = [f for f in FOOD_LIST if f not in self.session_shown]
        if len(candidates) < 2:
            candidates = FOOD_LIST.copy()

        # E[Q(s', a')], computed before any entry is changed.
        bootstrap = self.get_expected_q(next_state, candidates)

        if chosen is None:
            credit = [(card1, reward), (card2, reward)]
        else:
            other = card2 if chosen == card1 else card1
            credit = [(chosen, reward), (other, -0.2)]

        for card, card_reward in credit:
            previous = self.get_q(state, card)
            td_target = card_reward + self.gamma * bootstrap
            self.q_table[state][card] = previous + self.alpha * (td_target - previous)

        return (self.get_q(state, card1), self.get_q(state, card2))


# Registry mapping each model's display name to the class implementing it.
MODEL_CLASSES = {
    'Q-Learning': QLearningModel,
    'SARSA': SARSAModel,
    'Expected-SARSA': ExpectedSARSAModel
}

## Virtualaus vaiko simuliacija

```
Jei norimas vaisius yra tarp rodomų kortelių:
    → PASIRENKA
Kitaip:
    → ATMETA
```

In [None]:
class VirtualChild:
    """Simulated child that works through a fixed list of wanted items.

    The child accepts a card only when it equals the currently wanted item.
    """

    def __init__(self, wants_list: List[str]):
        self.wants_list = wants_list
        self.current_want_index = 0

    def get_current_want(self) -> Optional[str]:
        """Return the item wanted right now, or ``None`` once the list is used up."""
        if self.current_want_index >= len(self.wants_list):
            return None
        return self.wants_list[self.current_want_index]

    def next_want(self):
        """Advance to the next entry of the wants list."""
        self.current_want_index += 1

    def react_to_cards(self, card1: str, card2: str) -> Optional[str]:
        """Return the card matching the current want, or ``None`` if neither does.

        ``card1`` is checked before ``card2``.
        """
        want = self.get_current_want()
        for card in (card1, card2):
            if want == card:
                return card
        return None

    def reset(self):
        """Go back to the first entry of the wants list."""
        self.current_want_index = 0


def run_simulation(model: BaseRLModel, child: VirtualChild, logger: ExtendedResultsLogger) -> Dict:
    """Play every want of ``child`` against ``model`` and summarise the outcome.

    One session is run per entry the child wants. Within a session the model
    offers up to three card pairs; the session stops early as soon as the
    child picks a card. After each pair the model is updated with a reward of
    ``1.0`` (pick) or ``-1.0`` (both cards rejected) and the step is logged.

    Args:
        model: Agent that selects the card pairs and learns from the reactions.
        child: Simulated child; ``reset()`` is called on it before the first session.
        logger: Receives one record per shown pair; ``reset_cumulative()`` is
            called on it before the first session.

    Returns:
        Dict holding the model's configuration, the session counts, the
        success rate in percent (``0`` when no session ran) and the logger's
        ``max_success_streak``.
    """
    child.reset()
    model.start_new_session()
    logger.reset_cumulative()

    reward_success = 1.0
    reward_rejection = -1.0

    sessions_run = 0
    sessions_succeeded = 0
    sessions_failed = 0

    while child.get_current_want() is not None:
        sessions_run += 1
        wanted = child.get_current_want()
        model.start_new_session()

        session_success = False

        for set_number in (1, 2, 3):
            state = model.current_state
            card1, card2, selection_reason = model.select_cards()

            # Snapshot the decision context before the model learns from the reaction.
            q1_before = model.get_q(state, card1)
            q2_before = model.get_q(state, card2)
            best_q_card, best_q_value = model.get_best_q_info(state, FOOD_LIST)
            sim_card1 = model.get_similarity_to_rejected(card1)
            sim_card2 = model.get_similarity_to_rejected(card2)

            # Thompson samples are only looked up when that exploration method is active.
            if model.exploration_method == 'thompson_sampling':
                ts_card1 = model.last_ts_samples.get(card1)
                ts_card2 = model.last_ts_samples.get(card2)
            else:
                ts_card1 = ts_card2 = None

            chosen = child.react_to_cards(card1, card2)
            picked = chosen is not None

            if picked:
                other = card2 if chosen == card1 else card1
                next_state = State(0, 'success')
                q1_after, q2_after = model.update(
                    state, (card1, card2), reward_success, next_state, chosen
                )
                model.record_success(chosen, other)
            else:
                # The rejection counter of the state is capped at 3.
                next_state = State(min(state.rejections + 1, 3), 'rejection')
                q1_after, q2_after = model.update(
                    state, (card1, card2), reward_rejection, next_state, None
                )
                model.record_rejection([card1, card2])
            model.current_state = next_state

            # Model attributes below are read at log time, i.e. after update().
            logger.log(
                session=sessions_run, wanted=wanted, card1=card1, card2=card2,
                result='SUCCESS' if picked else 'REJECTION',
                chosen=chosen if picked else '',
                set_number=set_number, state=state,
                selection_reason=selection_reason, model_name=model.name,
                exploration=model.exploration_method, strategy=model.strategy,
                alpha=model.alpha, gamma=model.gamma, epsilon=model.epsilon,
                similarity_method=model.similarity_method,
                q1_before=q1_before, q2_before=q2_before,
                q1_after=q1_after, q2_after=q2_after,
                was_exploration=model.last_was_exploration,
                best_q_card=best_q_card, best_q_value=best_q_value,
                ts_sample_card1=ts_card1, ts_sample_card2=ts_card2,
                similarity_card1_to_rejected=sim_card1,
                similarity_card2_to_rejected=sim_card2,
                session_success=picked
            )

            if picked:
                session_success = True
                sessions_succeeded += 1
                break

        if not session_success:
            sessions_failed += 1

        child.next_want()

    success_rate = sessions_succeeded / sessions_run * 100 if sessions_run > 0 else 0

    return {
        'model': model.name,
        'exploration': model.exploration_method,
        'strategy': model.strategy,
        'alpha': model.alpha,
        'gamma': model.gamma,
        'epsilon': model.epsilon,
        'similarity': model.similarity_method,
        'total_sessions': sessions_run,
        'sessions_succeeded': sessions_succeeded,
        'sessions_failed': sessions_failed,
        'success_rate': success_rate,
        'max_success_streak': logger.max_success_streak
    }

## Simuliacijos paleidimas

In [None]:
# Reset the shared logger and the result list before the sweep starts.
LOGGER.clear()
results_summary = []

# Cartesian product of every model with every hyper-parameter value.
# The tuple layout is (model, exploration, strategy, alpha, gamma, epsilon, similarity).
all_combinations = list(itertools.product(
    MODEL_CLASSES,
    PARAM_GRID['exploration_method'],
    PARAM_GRID['strategy'],
    PARAM_GRID['alpha'],
    PARAM_GRID['gamma'],
    PARAM_GRID['epsilon'],
    PARAM_GRID['similarity_method'],
))

n_runs = len(all_combinations)
separator = "=" * 50

print(f"PALEIDŽIAMOS {n_runs} SIMULIACIJOS...")
print(separator)

for run_no, combination in enumerate(all_combinations, start=1):
    model_name, expl, strat, alpha, gamma, eps, sim = combination

    # Each run gets a freshly constructed model and child.
    model = MODEL_CLASSES[model_name](
        alpha=alpha,
        gamma=gamma,
        epsilon=eps,
        exploration_method=expl,
        strategy=strat,
        similarity_method=sim,
    )
    child = VirtualChild(CHILD_WANTS)

    result = run_simulation(model, child, LOGGER)
    results_summary.append(result)

    # Report progress every 50 runs and once more after the final run.
    if run_no % 50 == 0 or run_no == n_runs:
        print(f"   [{run_no}/{n_runs}] Baigta...")

print(separator)
print(f"\nIš viso {len(results_summary)} simuliacijų")

## TOP konfigūracijų suvestinė

In [None]:
# One row per simulated configuration, ranked by success rate (best first).
summary_df = pd.DataFrame(results_summary).sort_values('success_rate', ascending=False)

n_configurations = len(summary_df)
print(f"REZULTATŲ SUVESTINĖ ({n_configurations} kombinacijų)")
print("=" * 50)

print("\nTOP 20 KONFIGŪRACIJŲ:")
top_configurations = summary_df.head(20)
display(top_configurations)

## Eksportuoti rezultatus

In [None]:
# Write the ranked summary first, then let the logger write its per-step records.
summary_path = 'suvestine.csv'
summary_df.to_csv(summary_path, index=False)
print(f"Išsaugota: {summary_path}")

details_path = 'detalus_rezultatai.csv'
LOGGER.save_csv(details_path)

# Row counts of both exports, printed as a quick sanity check.
n_summary_rows = len(summary_df)
n_detail_rows = len(LOGGER.records)
print(f"\nSuvestinė: {n_summary_rows} eilučių")
print(f"Detalūs rezultatai: {n_detail_rows} eilučių")

## Trumpa rezultatų analizė

In [None]:
# For each swept parameter, print the mean success rate and its standard
# deviation per parameter value, ordered from the highest mean down.
for param in ('model', 'exploration', 'strategy', 'alpha', 'gamma', 'epsilon', 'similarity'):
    print(f"\n{param.upper()}:")
    stats = (
        summary_df.groupby(param)['success_rate']
        .agg(['mean', 'std'])
        .round(2)
        .sort_values('mean', ascending=False)
    )
    for value, mean_rate, std_rate in zip(stats.index, stats['mean'], stats['std']):
        print(f"  {value}: {mean_rate:.1f}% (±{std_rate:.1f}%)")