In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset and checkpoints under MyDrive are reachable as ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Data folder directory: base path on the mounted Drive that dataset paths are joined onto.
main_dir = '/content/drive/MyDrive/Colab Notebooks/Data/'

In [None]:
import os
import re
import json

from tqdm import tqdm

import numpy as np
import pandas as pd

In [None]:
# Load one CSV of the Ubuntu Dialogue Corpus and derive a per-dialogue key.
sub_dir = 'Ubuntu_Dialogue_Corpus/Ubuntu-dialogue-corpus/dialogueText_301.csv'
full_file_path = os.path.join(main_dir, sub_dir)

df_ubuntu = pd.read_csv(full_file_path)
df_ubuntu["date"] = pd.to_datetime(df_ubuntu["date"])

# id = "<folder>_<dialogue file name without the .tsv suffix>", e.g. "301_1".
# Built with vectorized string ops instead of a row-wise apply: the apply
# version relied on positional x[0]/x[1] lookups on a label-indexed Series
# (deprecated in pandas) and ran a Python lambda once per row.
# str.partition always treats the separator literally, matching str.split('.tsv')[0].
df_ubuntu["id"] = (
    df_ubuntu["folder"].astype(str)
    + "_"
    + df_ubuntu["dialogueID"].str.partition(".tsv")[0]
)

# The two source columns are now redundant with "id".
df_ubuntu = df_ubuntu.drop(columns=["folder", "dialogueID"])
df_ubuntu

Unnamed: 0,date,from,to,text,id
0,2004-11-23 11:49:00+00:00,stuNNed,,any ideas why java plugin takes so long to load?,301_1
1,2004-11-23 11:49:00+00:00,crimsun,stuNNed,java 1.4?,301_1
2,2004-11-23 11:49:00+00:00,stuNNed,crimsun,yes,301_1
3,2004-11-23 11:49:00+00:00,crimsun,stuNNed,java 1.5 loads _much_ faster,301_1
4,2004-11-23 11:50:00+00:00,stuNNed,crimsun,noneus: how can i get 1.5 is there a .deb some...,301_1
...,...,...,...,...,...
16587825,2007-11-15 03:38:00+00:00,koyo001,,thanks,32_1783
16587826,2007-11-15 03:39:00+00:00,koyo001,,does anyone know something,32_1783
16587827,2007-11-15 03:39:00+00:00,neverblue,,"no, no one knows everything",32_1783
16587828,2007-11-15 03:40:00+00:00,koyo001,ikonia,the camera doesnt work,32_1783


In [None]:
# Clean up the df: keep short dialogues that look like "question -> direct
# answer -> thank-you" and collect them as INSTRUCTION / RESPONSE pairs.
#
# A dialogue contributes a pair when
#   1. it has 3-5 messages,
#   2. its first message is at least 12 characters and looks like a question,
#   3. some message addressed to the asker passes the answer filters below, and
#   4. the asker later replies to that answerer with an acknowledgement.
# The RESPONSE stored is the most recent acceptable answer seen before the
# acknowledgement.

# Compiled once; these patterns do not change between dialogues.
QUESTION_RE = re.compile(r'(?i)(?:\?|what|who|where|why|when|how|whose|explain|tell|does|way|can|know|able|best|recommend)')
TRIVIAL_ANSWER_RE = re.compile(r'(?i)^(yes|yep|yeah|no|nah|nope|sure|yes\s*sir)\W*$')
OFF_TOPIC_RE = re.compile(r'(?i)(?:wrong|of.*?topic|else\s*where|ask.+?in|\#\w+|google|you.+?mean)')
ACKNOWLEDGEMENT_RE = re.compile(r'(?i)(?:thank|thx|works|working|great)')

clean = {col: [] for col in ["INSTRUCTION", "RESPONSE", "METADATA"]}

for dialogue_id, group in tqdm(df_ubuntu.groupby("id")):
    if len(group) < 3 or len(group) > 5:  # 3, 4, 5 len
        continue  # back and forth will most likely not be parsed correctly

    # Sort into a new frame rather than in place: `group` is a slice of
    # df_ubuntu, and mutating it triggers pandas' copy warnings. A stable sort
    # is required because timestamps only have minute resolution, so messages
    # within the same minute must keep their original relative order.
    group = group.sort_values(by="date", kind="mergesort")

    instruction = str(group["text"].values[0]).strip()
    question_user = group["from"].values[0]
    # str() turns a missing text into "nan", which this length filter also
    # rejects, so no separate NaN check is needed.
    if len(instruction) < 12:
        continue
    if not QUESTION_RE.search(instruction):
        continue  # probably not a question

    # Matches any participant's nick as a whole word; built once per dialogue.
    participants = set(group["to"].tolist() + group["from"].tolist())
    all_recipients = "|".join([re.escape(item) for item in participants if pd.notna(item)])
    recipients_re = re.compile(r'\b(' + all_recipients + r')\b')

    response = None
    response_user = None
    for _, row in group.iterrows():
        if row["to"] == question_user:
            candidate = str(row["text"]).strip()
            if not row["text"] or pd.isna(row["text"]) or TRIVIAL_ANSWER_RE.search(candidate):
                continue  # answer is not expressive
            if len(candidate) < 3:
                continue  # too short
            if OFF_TOPIC_RE.search(candidate):
                continue  # probably off topic
            if recipients_re.search(candidate):
                continue  # answer includes user name(s)
            response = candidate
            response_user = row["from"]
        elif response_user is not None and row["to"] == response_user and row["from"] == question_user:
            if ACKNOWLEDGEMENT_RE.search(str(row["text"])):
                clean["INSTRUCTION"].append(instruction)
                clean["RESPONSE"].append(response)
                clean["METADATA"].append(json.dumps({
                    "user_question": question_user,
                    "user_answer": response_user
                }))
                break  # one pair per dialogue

  for _, row in group.iterrows():
100%|██████████| 1852868/1852868 [15:26<00:00, 1999.19it/s] 


In [None]:
# Deduplicate questions: order by response length (longest first) so that
# drop_duplicates, which keeps the first occurrence, retains the longest
# response for each INSTRUCTION; then restore the original row order.
clean = (
    pd.DataFrame(clean)
    .sort_values(by="RESPONSE", key=lambda responses: responses.str.len(), ascending=False)
    .drop_duplicates(subset=["INSTRUCTION"])
    .sort_index()
)
clean

Unnamed: 0,INSTRUCTION,RESPONSE,METADATA
0,Did anyone see my question? Sorry but my conen...,"saw the question, and your second one, but no ...","{""user_question"": ""amt2_"", ""user_answer"": ""nic..."
1,does Ubuntu still have the 'check CD' before t...,"yes, if you press a key when it first starts b...","{""user_question"": ""leo_rockway"", ""user_answer""..."
2,I'm trying to understand what files I need to ...,"home is really the most important, the others ...","{""user_question"": ""TJ-42"", ""user_answer"": ""ASU..."
3,What section should I insert into xorg.conf an...,"pidgin can turn off join/part, empathy can not","{""user_question"": ""godfatherofeir1"", ""user_ans..."
4,Is there a list of all DEB program packages wh...,dpkg -l,"{""user_question"": ""bullgard4"", ""user_answer"": ..."
...,...,...,...
52846,if 800x600 is the 'default' 6:4 s screen resol...,4:3 and 800x480 would be the equivalent widesc...,"{""user_question"": ""Pelo"", ""user_answer"": ""Dais..."
52847,Hey i keep getting VGA errors when trying to i...,its a different download from the ubuntu page....,"{""user_question"": ""Lex`"", ""user_answer"": ""dari..."
52848,do you really have an /etc/lsb-release file? ...,"AFAIK, there has always been an /etc/lsb-relea...","{""user_question"": ""zykotick9"", ""user_answer"": ..."
52849,any idea why my wifi doesn't connect automatic...,a somewhat hackish way to fix it is just to ad...,"{""user_question"": ""kekk"", ""user_answer"": ""brue..."


In [None]:
# Share of dialogues (unique ids) that yielded a kept question/answer pair.
n_pairs = len(clean)
n_dialogues = len(df_ubuntu['id'].unique())
pct_retrieved = n_pairs / n_dialogues * 100.
print(f"Retrieved {pct_retrieved:.2f}% of all questions ({n_pairs})")

Retrieved 2.80% of all questions (51840)


In [None]:
# Print a fixed-size preview of the cleaned pairs.
# The index labels of `clean` have gaps after drop_duplicates, so they cannot
# serve as a row counter; head() limits by position instead.
PREVIEW_ROWS = 20
for _, row in clean.head(PREVIEW_ROWS).iterrows():
    print("Q >", row["INSTRUCTION"])
    print("A >", row["RESPONSE"])
    print()

Q > Did anyone see my question? Sorry but my conenction is really sucky and I'm not sure if this is working.
A > saw the question, and your second one, but no answer ;)

Q > does Ubuntu still have the 'check CD' before the installation menu pops up?
A > yes, if you press a key when it first starts booting and there's a weird image with a man and a keyboard key on the bottom of the screen

Q > I'm trying to understand what files I need to backup in case of hard drive failure.  Obviously I should backup my /home/ folder.  It also seems that I should backup my /etc/, /var/, and /usr/local.   Is there anything else I should consider?
A > home is really the most important, the others you mention could be useful, but unless you're backing up everything, there's obviously still going to be reinstallation to be done, and it could potentially be a hassle

Q > What section should I insert into xorg.conf and what options should I throw to disable the hardware cursor (using the FGLRX driver here).

In [None]:
# Persist the cleaned pairs to Drive (index column omitted).
clean_csv_path = '/content/drive/MyDrive/Colab Notebooks/Data/ubuntu_clean.csv'
clean.to_csv(clean_csv_path, index=False)

# Prepare the data as JSON Lines files

In [None]:
# Reload the cleaned pairs. keep_default_na=False keeps every cell as the
# string that was written: with the default NA parsing, literal answers such
# as "null", "N/A" or "nan" would come back as float NaN and later be
# serialised by json.dumps as the invalid JSON token NaN.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Data/ubuntu_clean.csv',
    keep_default_na=False,
)

In [None]:
# Turn every (INSTRUCTION, RESPONSE) row into an {'input', 'output'} record.
dialogue_pairs = [
    {'input': question, 'output': answer}
    for question, answer in zip(df['INSTRUCTION'], df['RESPONSE'])
]

# Peek at the first few records.
for pair in dialogue_pairs[:5]:
    print(pair)

{'input': "Did anyone see my question? Sorry but my conenction is really sucky and I'm not sure if this is working.", 'output': 'saw the question, and your second one, but no answer ;)'}
{'input': "does Ubuntu still have the 'check CD' before the installation menu pops up?", 'output': "yes, if you press a key when it first starts booting and there's a weird image with a man and a keyboard key on the bottom of the screen"}
{'input': "I'm trying to understand what files I need to backup in case of hard drive failure.  Obviously I should backup my /home/ folder.  It also seems that I should backup my /etc/, /var/, and /usr/local.   Is there anything else I should consider?", 'output': "home is really the most important, the others you mention could be useful, but unless you're backing up everything, there's obviously still going to be reinstallation to be done, and it could potentially be a hassle"}
{'input': 'What section should I insert into xorg.conf and what options should I throw to 

In [None]:
# Sequential 80/20 split: the leading 80% of records train, the rest evaluate.
TRAIN_FRACTION = 0.8
train_size = int(TRAIN_FRACTION * len(dialogue_pairs))
train_pairs, eval_pairs = dialogue_pairs[:train_size], dialogue_pairs[train_size:]

In [None]:
save_path = '/content/drive/MyDrive/Colab Notebooks/Data/'

# Write each split as JSON Lines (one JSON object per line):
# training pairs -> clean_ubuntu_train.jsonl, evaluation pairs -> clean_ubuntu_eval.jsonl.
for jsonl_name, pairs in (
    ('clean_ubuntu_train.jsonl', train_pairs),
    ('clean_ubuntu_eval.jsonl', eval_pairs),
):
    with open(save_path + jsonl_name, 'w') as file:
        file.writelines(json.dumps(pair) + '\n' for pair in pairs)


# Load Data to Model

In [None]:
import json
# Drive folder holding the clean_ubuntu_*.jsonl files; redefined here so this
# section can run without re-executing the data-preparation cells.
save_path = '/content/drive/MyDrive/Colab Notebooks/Data/'

In [None]:
!pip install -q huggingface_hub
!pip install -q -U trl transformers accelerate peft
!pip install -q -U datasets bitsandbytes einops wandb
!pip install  -q ipywidgets
!pip install -q scipy

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.0/118.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Interactive Hugging Face Hub login for this notebook session.
# NOTE(review): presumably required because the Llama-2 weights loaded below
# are access-gated — confirm.
from huggingface_hub import notebook_login
notebook_login()

from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
from datasets import load_dataset

# Each JSONL file is loaded as its own Dataset. split='train' is used for both
# because files passed through data_files land in a split named 'train'.
train_jsonl_path = save_path + 'clean_ubuntu_train.jsonl'
eval_jsonl_path = save_path + 'clean_ubuntu_eval.jsonl'

train_dataset = load_dataset('json', data_files=train_jsonl_path, split='train')
eval_dataset = load_dataset('json', data_files=eval_jsonl_path, split='train')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
def formatting_func(example):
    """Render dataset record(s) as "### Question / ### Answer" training text.

    Accepts either a single record (``example['input']`` and
    ``example['output']`` are scalars) or a batch of columns (both are
    lists/tuples of equal length), which is what a batched dataset map passes.

    Formatting a batch with a single f-string would stringify the whole lists
    into one training text, so batches are expanded element by element.

    Args:
        example: Mapping with 'input' and 'output' keys.

    Returns:
        A list of formatted strings: one element for a single record, or one
        element per row for a batch.
    """
    questions, answers = example['input'], example['output']

    if isinstance(questions, (list, tuple)):
        # Batched call: one formatted text per (question, answer) row.
        return [
            f"### Question: {question}\n ### Answer: {answer}"
            for question, answer in zip(questions, answers)
        ]

    # Single record: same one-element list as before.
    text = f"### Question: {questions}\n ### Answer: {answers}"
    return [text]

In [None]:
# Load the Llama-2 7B base model in 4-bit precision, plus its tokenizer.
base_model_name = "meta-llama/Llama-2-7b-hf"

# 4-bit NF4 weight quantization (bitsandbytes) with float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# NOTE(review): trust_remote_code=True permits running Python code shipped in
# the model repository — confirm it is actually needed for this checkpoint.
# NOTE(review): use_auth_token may be deprecated in favour of `token` in newer
# transformers releases — verify against the installed version.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True
)
# Turn off the generation key/value cache (presumably because it is of no use
# while training — confirm).
base_model.config.use_cache = False

# More info: https://github.com/huggingface/transformers/pull/24906
base_model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# Reuse the end-of-sequence token for padding (presumably because this
# tokenizer defines no pad token of its own — confirm).
tokenizer.pad_token = tokenizer.eos_token



Downloading (…)lve/main/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
# Where the trainer writes its checkpoints.
output_dir = "./Llama-2-7b-hf-fine-tune-baby"

# Short run: 100 optimizer steps, each accumulating 4 micro-batches of 4
# examples per device (4 x 4 = 16 examples per step per device). Logging,
# checkpointing and evaluation all share a 50-step interval.
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=50,
    max_steps=100,
    logging_dir="./logs",        # Directory for storing logs
    save_strategy="steps",       # Checkpoint on a step interval (set by save_steps)
    save_steps=50,                # Save a checkpoint every 50 steps
    evaluation_strategy="steps", # Evaluate on a step interval (set by eval_steps)
    eval_steps=50,               # Run evaluation every 50 steps
    do_eval=True                 # Enable evaluation on the eval dataset
)

In [None]:
# LoRA adapter settings for causal-LM fine-tuning: rank 64, alpha 16,
# 10% dropout on the adapter path, no bias parameters trained.
# NOTE(review): alpha/r = 16/64 gives a small adapter scaling if the usual
# alpha/r convention applies — confirm this ratio is intended.
# NOTE(review): no target_modules are given, so which layers receive adapters
# is left to the library's defaults for this architecture — verify.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
# Sanity check: the first training record should expose 'input' and 'output'.
train_dataset[0]

{'input': "Did anyone see my question? Sorry but my conenction is really sucky and I'm not sure if this is working.",
 'output': 'saw the question, and your second one, but no answer ;)'}

In [None]:
# Build the supervised fine-tuning trainer: the quantized base model is
# combined with the LoRA config, and raw records are turned into text by
# formatting_func.
# NOTE(review): depending on the trl version, formatting_func is called with a
# single example or with a batch of columns — verify it handles what the
# installed version passes.
max_seq_length = 512  # sequence length limit handed to the trainer
trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    formatting_func=formatting_func,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_args,
)


# pass in resume_from_checkpoint=True to resume from a checkpoint
trainer.train()

In [None]:
import os
# Save the trained model to Drive so it survives the Colab session.
# (Presumably this writes the LoRA adapter, since the trainer was built with
# peft_config; the directory is loaded later via PeftModel.from_pretrained.)
drive_data_dir = "/content/drive/MyDrive/Colab Notebooks/Data/"
output_dir1 = os.path.join(drive_data_dir, "cleaned_final_checkpoint")
trainer.model.save_pretrained(output_dir1)

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import PeftModel
import os
# Location of the saved fine-tuned checkpoint on Drive.
output_dir1 = os.path.join(
    "/content/drive/MyDrive/Colab Notebooks/Data/",
    "cleaned_final_checkpoint",
)

In [None]:
# Reload the base model for inference with the same 4-bit NF4 / float16
# quantization settings used for training.
base_model_name="meta-llama/Llama-2-7b-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
# NOTE(review): trust_remote_code=True permits running Python code shipped in
# the model repository — confirm it is actually needed for this checkpoint.
# NOTE(review): use_auth_token may be deprecated in favour of `token` in newer
# transformers releases — verify against the installed version.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# Tokenizer matching the base model; padding reuses the end-of-sequence token,
# the same setup as in the training section.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Attach the fine-tuned weights saved in output_dir1 to the freshly loaded base model.
model = PeftModel.from_pretrained(base_model, output_dir1)

In [None]:
# Ask the fine-tuned model one question and print its answer.
# The prompt reproduces the "### Question: ...\n ### Answer:" template used to
# format the training text, so the model sees the layout it was tuned on.
eval_question = "does ubuntu come with a firewall by default?"
eval_prompt = f"### Question: {eval_question}\n ### Answer:"

# Tokenize the prompt and place it on the model's own device (not a hard-coded one)
model_input = tokenizer(eval_prompt, return_tensors="pt").to(model.device)
prompt_length = model_input["input_ids"].shape[1]

# Set the model to evaluation mode
model.eval()

# Generate a response with a strict token limit
with torch.no_grad():
    generated_response = model.generate(
        **model_input,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id,  # explicit pad id for generation
    )[0]
    decoded_response = tokenizer.decode(generated_response, skip_special_tokens=True)
    # Decode only the newly generated tokens. Stripping the prompt with a
    # string replace breaks whenever decoding does not reproduce the prompt
    # text character for character.
    answer_text = tokenizer.decode(generated_response[prompt_length:], skip_special_tokens=True)

# Keep only the first answer: the model tends to keep going and invent
# follow-up questions after it has answered.
answer_text = answer_text.split("### Question:")[0].strip()

# Print the provided question and the model's generated response
print(eval_prompt, answer_text)

Question: does ubuntu come with a firewall by default? 
 Answer:  yes, u can turn it on or off as you like.  its called iptables.
Question: how do i enable the 'touch' command?
Question: what is the best way to backup a large number of mp3 files?
Question: how do i install the latest flash plugin?
Question: how do i get a command line that will tell me my cpu and ram info?
Question: how can i get a command line that will tell me my cpu and
