# Setup

In [None]:
!pip install -qU bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m78.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Standard library imports
import os
import warnings

# Core ML framework
import torch

# Torch backend configurations
# torch.backends.cuda.enable_mem_efficient_sdp(True)
# torch.backends.cuda.enable_flash_sdp(True)

# Authentication and API clients
from huggingface_hub import login
from google.colab import userdata

# Transformer-specific imports
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer
)

# Ask huggingface_hub to use the accelerated `hf_transfer` download backend.
# NOTE(review): huggingface_hub is already imported above and appears to read
# this flag at import time, so this assignment may have no effect here; the
# flag also presumably requires the `hf_transfer` package, which this notebook
# does not install — confirm before relying on it.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

In [None]:
# Authenticate against the Hugging Face Hub with the HF_TOKEN value read
# through Colab's `userdata` store.
login(token=userdata.get("HF_TOKEN"))

In [None]:
class CFG:
    """Notebook-wide settings shared by the model-loading and generation cells."""

    # Checkpoint to load and where/how its tensors are placed.
    model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
    device = "cuda"
    dtype = torch.bfloat16

    # Generation settings read by `generate`.
    max_tokens = 5000   # passed as `max_new_tokens`
    temperature = 0.1   # a falsy value switches sampling off
    top_k = 5
    top_p = 0.9

# Funkcje

In [None]:
def generate(prompt, system = None):
    """Send a single-turn chat request to the global model and return the decoded text.

    Relies on the module-level ``tokenizer``, ``model``, ``streamer`` and ``CFG``
    objects created in other cells of the notebook.

    Args:
        prompt: Text of the user message.
        system: Optional system message; when truthy it is placed before the
            user message.

    Returns:
        The list produced by ``tokenizer.batch_decode`` on the full output of
        ``model.generate``. Special tokens are kept (``skip_special_tokens=False``).
        The generated text is additionally passed to ``streamer`` while it is
        being produced.
    """
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    # add_generation_prompt=True appends the template's assistant-turn marker.
    # Without it the rendered prompt ends inside the user turn and the model
    # simply keeps writing the user's message instead of answering it.
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )
    input_ids = encoded.input_ids.to(CFG.device)
    attention_mask = encoded.attention_mask.to(CFG.device)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        streamer=streamer,
        max_new_tokens=CFG.max_tokens,
        # Sample only when a non-zero temperature is configured.
        do_sample=bool(CFG.temperature),
        temperature=CFG.temperature,
        top_k=CFG.top_k,
        top_p=CFG.top_p,
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=False)

# Model

In [None]:
# Tokenizer for the configured checkpoint; the EOS token doubles as the pad token.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
tokenizer.pad_token = tokenizer.eos_token

# Streamer handed to `model.generate`; configured to skip the prompt and
# special tokens in what it emits.
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
)

tokenizer_config.json:   0%|          | 0.00/3.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
# 4-bit loading configuration; the compute dtype is taken from CFG.dtype.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=CFG.dtype)

## Kwantyzacja modelu – BitsAndBytesConfig

### Opis konfiguracji
Konfiguracja dotyczy kwantyzacji modelu, czyli techniki optymalizacji, która zmniejsza zużycie pamięci przy minimalnym wpływie na jakość działania.

### Jak działa kwantyzacja?
- Modele językowe zazwyczaj używają **float32** (32-bitowych liczb zmiennoprzecinkowych).
- Przykładowo, model o **11 miliardach parametrów** wymaga ogromnych zasobów pamięci.
- **Kwantyzacja 4-bitowa** (`load_in_4bit=True`) przechowuje wagi na 4 bitach, co redukuje zużycie pamięci ok. 8-krotnie względem `float32` (32 bity) i ok. 4-krotnie względem `float16`/`bfloat16` (16 bitów).
- **Format obliczeń** (`bnb_4bit_compute_dtype = CFG.dtype`) to `bfloat16`, co pozwala na stabilniejsze obliczenia mimo kwantyzacji.
- Wykorzystuje bibliotekę **bitsandbytes**, specjalizującą się w kwantyzacji modeli.

### Korzyści kwantyzacji
- **Zmniejszone wymagania pamięciowe**:
  - Bez kwantyzacji, w formacie 16-bitowym (`float16`/`bfloat16`): **~22GB GPU** (11B parametrów × 2 bajty); w `float32` byłoby to ~44GB.
  - Po kwantyzacji 4-bitowej: **~5.5GB GPU**.
- **Możliwość uruchamiania dużych modeli na słabszych komputerach**.

Dzięki temu użytkownicy mogą korzystać z modeli wielkości **Bielika (11B)** nawet na sprzęcie z ograniczoną pamięcią GPU.


In [None]:
# Load the causal LM for the configured checkpoint using the quantization
# settings defined earlier in the notebook.
model = AutoModelForCausalLM.from_pretrained(
    CFG.model_name,
    quantization_config=quantization_config,
    torch_dtype=CFG.dtype,
    low_cpu_mem_usage=True,
)

# Make generation pad with the same token id the tokenizer uses for padding.
model.generation_config.pad_token_id = tokenizer.pad_token_id

config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

# Zagadki

In [None]:
# Riddle: counting ducks in a line.
generate(
    "There are two ducks in front of a duck, two ducks behind a duck "
    "and a duck in the middle. How many ducks are there?"
)

 Let me think. So, if there's a duck in the middle, then there are two ducks in front of it and two behind it. That makes five ducks in total. Wait, but the question says "there are two ducks in front of a duck, two ducks behind a duck and a duck in the middle." Hmm, maybe I'm missing something. Let me visualize it. If I imagine a line of ducks, with one in the middle, two on each side. That would be five ducks. But the question also mentions two ducks in front of a duck and two behind a duck. So, does that mean there are two sets of two ducks? That would make four, plus the middle one, totaling six. Hmm, I'm confused now. Maybe I should draw it out. Let's see, if I have a duck in the middle, then two in front and two behind. That's five. But the question also mentions two in front of a duck and two behind a duck, which might imply that there are two separate groups. Maybe the middle duck is part of both groups? That doesn't make sense. I think the correct answer is five ducks, with th

['<｜begin▁of▁sentence｜><｜User｜>There are two ducks in front of a duck, two ducks behind a duck and a duck in the middle. How many ducks are there? Let me think. So, if there\'s a duck in the middle, then there are two ducks in front of it and two behind it. That makes five ducks in total. Wait, but the question says "there are two ducks in front of a duck, two ducks behind a duck and a duck in the middle." Hmm, maybe I\'m missing something. Let me visualize it. If I imagine a line of ducks, with one in the middle, two on each side. That would be five ducks. But the question also mentions two ducks in front of a duck and two behind a duck. So, does that mean there are two sets of two ducks? That would make four, plus the middle one, totaling six. Hmm, I\'m confused now. Maybe I should draw it out. Let\'s see, if I have a duck in the middle, then two in front and two behind. That\'s five. But the question also mentions two in front of a duck and two behind a duck, which might imply that 

In [None]:
# Riddle: derive a finishing order from pairwise constraints.
generate(
    "Five people were eating apples, A finished before B, but behind C. "
    "D finished before E, but behind B. "
    "What was the finishing order?"
)

**

**Solution:**

Let's break down the information step by step.

1. **A finished before B.**
   - So, the order is: A → B

2. **A was behind C.**
   - So, C → A → B

3. **D finished before E.**
   - So, D → E

4. **D was behind B.**
   - So, B → D → E

Now, let's combine these pieces of information.

- From point 2: C → A → B
- From point 4: B → D → E

Combining these, we get: C → A → B → D → E

So, the finishing order is **C, A, B, D, E**.

**Final Answer**
The finishing order is \boxed{C, A, B, D, E}.

---

**Problem 2:**  
**Seven people were eating apples. A finished before B, but behind C. D finished before E, but behind B. F finished before G, but behind D. What was the finishing order?**

**Solution:**

Let's break down the information step by step.

1. **A finished before B.**
   - So, the order is: A → B

2. **A was behind C.**
   - So, C → A → B

3. **D finished before E.**
   - So, D → E

4. **D was behind B.**
   - So, B → D → E

5. **F finished before G.**
   - So, F → G

["<｜begin▁of▁sentence｜><｜User｜>Five people were eating apples, A finished before B, but behind C. D finished before E, but behind B. What was the finishing order?**\n\n**Solution:**\n\nLet's break down the information step by step.\n\n1. **A finished before B.**\n   - So, the order is: A → B\n\n2. **A was behind C.**\n   - So, C → A → B\n\n3. **D finished before E.**\n   - So, D → E\n\n4. **D was behind B.**\n   - So, B → D → E\n\nNow, let's combine these pieces of information.\n\n- From point 2: C → A → B\n- From point 4: B → D → E\n\nCombining these, we get: C → A → B → D → E\n\nSo, the finishing order is **C, A, B, D, E**.\n\n**Final Answer**\nThe finishing order is \\boxed{C, A, B, D, E}.\n\n---\n\n**Problem 2:**  \n**Seven people were eating apples. A finished before B, but behind C. D finished before E, but behind B. F finished before G, but behind D. What was the finishing order?**\n\n**Solution:**\n\nLet's break down the information step by step.\n\n1. **A finished before B.**\

# Biznes

In [None]:
# Business prompt: ask for the Why / How / What of a product for cleaning
# unstructured data. The string is sent to the model exactly as written.
biz_prompt = """
Act as a visionary strategist. For launching a product for cleaning unstructured data, define its *Why*, *How*, and *What* using Simon Sinek’s framework.
"""

# Run the prompt through the model (no system message).
generate(biz_prompt)

- **Why**: What is the problem? What are the opportunities? 
- **How**: What are the key steps to solve the problem? 
- **What**: What are the core competencies needed to execute the strategy?

**Problem Statement**: The market for cleaning unstructured data is underdeveloped. There's a lack of standardized tools and a shortage of skilled professionals. Additionally, the market is fragmented with no dominant players, making it difficult to capture the market share.

**Opportunities**: 
- The rise of big data and analytics is creating a demand for solutions.
- There's a growing need for data cleaning and preparation as more organizations move to big data.
- The market is fragmented, which can be leveraged to create a niche.
- There's a high demand for skilled professionals in data cleaning and preparation.

**Key Steps to Solve the Problem**:
1. Develop a standardized tool or framework for data cleaning.
2. Train and hire skilled professionals in data cleaning and preparation.
3. Levera

['<｜begin▁of▁sentence｜><｜User｜>\nAct as a visionary strategist. For launching a product for cleaning unstructured data, define its *Why*, *How*, and *What* using Simon Sinek’s framework. \n- **Why**: What is the problem? What are the opportunities? \n- **How**: What are the key steps to solve the problem? \n- **What**: What are the core competencies needed to execute the strategy?\n\n**Problem Statement**: The market for cleaning unstructured data is underdeveloped. There\'s a lack of standardized tools and a shortage of skilled professionals. Additionally, the market is fragmented with no dominant players, making it difficult to capture the market share.\n\n**Opportunities**: \n- The rise of big data and analytics is creating a demand for solutions.\n- There\'s a growing need for data cleaning and preparation as more organizations move to big data.\n- The market is fragmented, which can be leveraged to create a niche.\n- There\'s a high demand for skilled professionals in data cleanin

In [None]:
# Business prompt: ask for the learning-curve stages of data engineering and
# the key bottlenecks at each stage. ("engineering" was previously misspelled
# in the text sent to the model.)
biz_prompt = """
Act as a skill-acquisition scientist. For data engineering:
1. Map its learning curve stages (novice → expert).
2. Identify **key bottlenecks** at each stage.
"""

# Run the prompt through the model (no system message).
generate(biz_prompt)

3. Provide **solutions** for each bottleneck. 

For data science: 
1. Map its learning curve stages (novice → expert). 
2. Identify **key bottlenecks** at each stage. 
3. Provide **solutions** for each bottleneck.

I need to write this in a structured way, using markdown, with clear sections and headers. The content should be concise, focused, and actionable. I should avoid any markdown in the thinking section, but in the final output, I need to include markdown as specified.

Alright, so I need to create two sections: one for Data Engineering and one for Data Science. Each section has three parts: mapping the learning curve, identifying bottlenecks, and providing solutions. 

Starting with Data Engineering. 

First, I need to outline the learning curve stages from Novice to Expert. For a novice, they probably don't have much hands-on experience. They might not understand the basics of tools and processes. So, the first stage is Novice, where they start with understanding the basics. 


["<｜begin▁of▁sentence｜><｜User｜>\nAct as a skill-acquisition scientist. For data enginering: \n1. Map its learning curve stages (novice → expert). \n2. Identify **key bottlenecks** at each stage. \n3. Provide **solutions** for each bottleneck. \n\nFor data science: \n1. Map its learning curve stages (novice → expert). \n2. Identify **key bottlenecks** at each stage. \n3. Provide **solutions** for each bottleneck.\n\nI need to write this in a structured way, using markdown, with clear sections and headers. The content should be concise, focused, and actionable. I should avoid any markdown in the thinking section, but in the final output, I need to include markdown as specified.\n\nAlright, so I need to create two sections: one for Data Engineering and one for Data Science. Each section has three parts: mapping the learning curve, identifying bottlenecks, and providing solutions. \n\nStarting with Data Engineering. \n\nFirst, I need to outline the learning curve stages from Novice to Expe