In [1]:
import os
import pandas as pd
os.listdir("../data/2nd_finetune/")

['RAFT', 'fen_parsing.txt', 'capture_explanation.csv', '.ipynb_checkpoints']

# RAFT

In [2]:
# Empty frame declaring the Question/Context/Answer schema; `raft_df` is
# reassigned from the concatenated RAFT parts later in this notebook.
raft_df = pd.DataFrame(columns=["Question", "Context", "Answer"])

In [3]:
os.listdir("../data/2nd_finetune/RAFT/")

['raft.csv',
 '.ipynb_checkpoints',
 'questions.txt',
 'raft.txt',
 'raft_o_c.txt',
 'raft_some_o_c.txt']

In [4]:
# context
from typing import List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core import SimpleDirectoryReader

def get_doc_chunk(
    file_path: str,
    separators: Optional[List[str]] = None,
    chunk_size: int = 500,
) -> List[str]:
    """Load the first document under ``file_path`` and split it into chunks.

    Args:
        file_path: Directory handed to ``SimpleDirectoryReader``.
        separators: Separator strings given to the splitter; they are passed
            with ``is_separator_regex=False``. Defaults to ``["\\n\\n", "\\n"]``.
        chunk_size: Value forwarded as ``chunk_size`` to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The chunks of the first loaded document, with every newline inside a
        chunk replaced by a single space.
    """
    # Build the default per call rather than sharing one mutable list object
    # across every invocation of the function.
    if separators is None:
        separators = ["\n\n", "\n"]

    docs = SimpleDirectoryReader(file_path).load_data()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        separators=separators,
        is_separator_regex=False,
    )

    # we have a single document only, so only docs[0] is split
    chunks = text_splitter.split_text(docs[0].text)

    return [chunk.replace("\n", " ") for chunk in chunks]

# Chunk the source text once; the integer indices parsed from raft.txt below
# are used as positions in this list.
docs = get_doc_chunk("../RAFT/text/")

In [5]:
len(docs)

18

### 1 relevant context

In [46]:
# Read through a context manager so the file handle is closed right away
# instead of being left for the garbage collector.
with open("../data/2nd_finetune/RAFT/raft.txt", "r") as raft_file:
    raft_main = raft_file.read()

In [47]:
# Splitting the file content into multiple entries based on the delimiter '=====\n'
entries = raft_main.split('=====\n')

# Parallel lists: position k in each list belongs to the same record
questions = []
answers = []
contexts = []

# Entries are processed in groups of 4 (question, answer, context, index).
# The context entry at i + 2 is not read; the chunk is looked up in `docs`
# through the integer index at i + 3 instead.
for i in range(0, len(entries) - 1, 4):
    try:
        # Extract and strip the parts
        question = entries[i].strip()
        answer = entries[i + 1].strip()
        index = int(entries[i + 3].strip())
        # Resolve the chunk BEFORE appending anything: if the index is out of
        # range the whole record is skipped and the three lists stay aligned.
        context = docs[index]
    except (IndexError, ValueError) as e:
        print(f"Skipping malformed entry at index {i}: {e}")
        continue

    # Only fully parsed records reach this point
    questions.append(question)
    answers.append(answer)
    contexts.append(context)

In [48]:
len(questions), len(contexts), len(answers)

(36, 36, 36)

In [49]:
# Column-wise view of the "1 relevant context" records
data = dict(Question=questions, Context=contexts, Answer=answers)

raft_main_df = pd.DataFrame(data)

In [50]:
raft_main_df.head()

Unnamed: 0,Question,Context,Answer
0,"Who are Tardo and Peo in ""DISQUALIFIED"" by Cha...",DISQUALIFIED BY CHARLES L. FONTENAY After t...,"To determine who Tardo and Peo are in ""DISQUAL..."
1,What is the significance of the castle overloo...,DISQUALIFIED BY CHARLES L. FONTENAY After t...,To determine the significance of the castle ov...
2,Who entertained Tardo and Peo at luncheon?,"After the morning inspection tour, Tardo, the ...","To answer the question ""Who entertained Tardo ..."
3,What was served for dessert?,"After the morning inspection tour, Tardo, the ...","To determine what was served for dessert, we n..."
4,What technical aid is available aboard the ship?,"""My recommendation will be of considerable imp...","To answer the question ""What technical aid is ..."


### out of context

In [51]:
# Read through a context manager so the file handle is closed right away.
with open("../data/2nd_finetune/RAFT/raft_o_c.txt") as o_c_file:
    o_c_raft = o_c_file.read()

In [52]:
entries = o_c_raft.split('=====\n')

questions, answers, contexts = [], [], []

# Records are read as 3 consecutive entries: question, answer, context
for start in range(0, len(entries) - 1, 3):
    try:
        raw_question, raw_answer, raw_context = (
            entries[start],
            entries[start + 1],
            entries[start + 2],
        )
    except (IndexError, ValueError) as err:
        # A trailing, incomplete group ends up here
        print(f"Skipping malformed entry at index {start}: {err}")
    else:
        questions.append(raw_question.strip())
        answers.append(raw_answer.strip())
        contexts.append(raw_context.strip())

In [53]:
# Column-wise view of the out-of-context records
data = dict(Question=questions, Context=contexts, Answer=answers)

raft_o_c = pd.DataFrame(data)

In [54]:
raft_o_c.head()

Unnamed: 0,Question,Context,Answer
0,What is the capital of France?,DISQUALIFIED BY CHARLES L. FONTENAY After t...,The context provided is a science fiction stor...
1,What is the capital of Japan?,"After the morning inspection tour, Tardo, the ...",The context provided does not directly mention...
2,What is the capital of Canada?,"""My recommendation will be of considerable imp...",The context provided does not contain any rele...
3,What is the capital of Australia?,"""I'm afraid our culture is too simple and agra...",The context provided does not contain any rele...
4,What is the capital of Azerbaijan?,"""We really feel that we have done well since w...",The context provided does not contain any rele...


### some context

In [55]:
# Read through a context manager so the file handle is closed right away.
with open("../data/2nd_finetune/RAFT/raft_some_o_c.txt") as some_o_c_file:
    some_o_c_raft = some_o_c_file.read()

In [56]:
entries = some_o_c_raft.split('=====\n')

questions, answers, contexts = [], [], []

# Records are read as 3 consecutive entries: question, context, answer
for start in range(0, len(entries) - 1, 3):
    try:
        fields = [entries[start + offset].strip() for offset in range(3)]
    except (IndexError, ValueError) as err:
        # A trailing, incomplete group ends up here
        print(f"Skipping malformed entry at index {start}: {err}")
        continue

    question_text, context_text, answer_text = fields
    questions.append(question_text)
    answers.append(answer_text)
    contexts.append(context_text)

In [58]:
# Column-wise view of the "some context" records
data = dict(Question=questions, Context=contexts, Answer=answers)

raft_some_o_c = pd.DataFrame(data)

In [59]:
raft_some_o_c.head()

Unnamed: 0,Question,Context,Answer
0,"Who are Tardo and Peo in ""DISQUALIFIED"" by Cha...",DISQUALIFIED BY CHARLES L. FONTENAY After t...,"Given the context provided, Tardo and Peo are ..."
1,What is the significance of the castle overloo...,DISQUALIFIED BY CHARLES L. FONTENAY After t...,"Given the context provided, the significance o..."
2,Who entertained Tardo and Peo at luncheon?,"After the morning inspection tour, Tardo, the ...","Given the context provided, we can see that Ta..."
3,What was served for dessert?,"After the morning inspection tour, Tardo, the ...","To answer the question ""What was served for de..."
4,What technical aid is available aboard the ship?,"""My recommendation will be of considerable imp...",The context provided indicates that there is c...


In [60]:
# Stack the three RAFT variants and shuffle the rows. The fixed seed keeps the
# row order (and therefore the exported CSV) identical across re-runs.
raft_df = (
    pd.concat([raft_main_df, raft_o_c, raft_some_o_c], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [61]:
raft_df.head(10)

Unnamed: 0,Question,Context,Answer
0,What is the reason why the ship just rusted away?,"""We really feel that we have done well since w...",The context provided discusses a conversation ...
1,What was the reason for having to walk on the ...,Tardo had seemed most intent on the question o...,"To answer the question ""What was the reason fo..."
2,What was the reason for having to walk on the ...,Tardo had seemed most intent on the question o...,"Given the context provided, the reason for hav..."
3,Who is responsible for working the fields on t...,"Saranta apologized for their having to walk, e...","Given the context provided, Saranta mentioned ..."
4,"Who are Tardo and Peo in ""DISQUALIFIED"" by Cha...",DISQUALIFIED BY CHARLES L. FONTENAY After t...,"To determine who Tardo and Peo are in ""DISQUAL..."
5,Why did colonizing ships have to depend on nat...,"""If you haven't seen them, how do you know the...","To answer the question ""Why did colonizing shi..."
6,What is the capital of Bangladesh?,"""But you were able to solve this situation in ...",The context provided does not contain any info...
7,What is the reason for the lack of workers in ...,"Tardo asked about the fields. ""I see there is...",To answer the question about the reason for th...
8,What is the capital of Japan?,"After the morning inspection tour, Tardo, the ...",The context provided does not directly mention...
9,What did Saranta say?,"""Saranta said that. But I don't see ..."" ""Tho...","Given the context provided, Saranta's actual w..."


In [62]:
raft_df.shape

(96, 3)

In [64]:
# Write the shuffled RAFT frame to raft.csv (relative to the working
# directory) without the index column.
raft_df.to_csv("raft.csv", index=False)

# capture explanation

In [120]:
# Load the capture-explanation table; the first CSV column becomes the index.
capture_df = pd.read_csv("../data/2nd_finetune/capture_explanation.csv", index_col=0)
capture_df.head()

Unnamed: 0,moves,game_length,capture_explanantion
0,d4 f5 Bf4 Nf6 e3 d6 Nc3 Nc6 Bc4 e5 dxe5 dxe5 Q...,31,"\nWhite: d4, Black: f5\n(no capture at White: ..."
1,e4 e5 Nf3 f5 d3 fxe4 dxe4 Bb4+ Bd2 Bxd2+ Nbxd2...,43,"\nWhite: e4, Black: e5\n(no capture at White: ..."
2,e4 c5 Nf3 g6 d4 cxd4 Nxd4 Nc6 Nxc6 bxc6 Nc3 Bg...,14,"\nWhite: e4, Black: c5\n(no capture at White: ..."
3,d4 Nf6 c4 g6 Nc3 Bg7 e4 d6 f3 O-O Be3 e5 d5 c6...,40,"\nWhite: d4, Black: Nf6\n(no capture at White:..."
4,e4 e5 d3 Nf6 Bg5 Nc6 Bxf6 gxf6 a3 Qe7 b4 d6 c4...,19,"\nWhite: e4, Black: e5\n(no capture at White: ..."


In [121]:
# Turn every move list into a full prompt by prefixing the question text.
# NOTE: this overwrites "moves" in place, so running the cell twice prefixes twice.
capture_prompt = "How many chess pieces have been captured in the following game provided in algebraic notation - "
capture_df["moves"] = capture_prompt + capture_df["moves"]

In [122]:
# One .loc call with row and column labels instead of chained indexing
capture_df.loc[0, "moves"]

'How many chess pieces have been captured in the following game provided in algebraic notation - d4 f5 Bf4 Nf6 e3 d6 Nc3 Nc6 Bc4 e5 dxe5 dxe5 Qxd8+ Kxd8 O-O-O+ Bd6 Bg5 Ke7 Nd5+ Kf8 Nxf6 gxf6 Bxf6 Rg8 Bxg8 Kxg8 Nf3 Kf7 Bh4 Be6 a3 b5 h3 b4 axb4 Bxb4 Bg3 Bd6 Rd2 a5 Rhd1 f4 exf4 exf4 Bh4 a4 Ng5+ Kf6 Ne4+ Ke5 Nxd6 cxd6 Rxd6 a3 Rxc6 axb2+ Kxb2 Ra2+ Kc3 Ra3+ Kb4'

In [123]:
# Add the empty Context column and remove game_length without mutating in
# place. errors="ignore" keeps the cell re-runnable once game_length is gone.
capture_df = capture_df.assign(Context="").drop(columns=["game_length"], errors="ignore")

In [124]:
capture_df.head()

Unnamed: 0,moves,capture_explanantion,Context
0,How many chess pieces have been captured in th...,"\nWhite: d4, Black: f5\n(no capture at White: ...",
1,How many chess pieces have been captured in th...,"\nWhite: e4, Black: e5\n(no capture at White: ...",
2,How many chess pieces have been captured in th...,"\nWhite: e4, Black: c5\n(no capture at White: ...",
3,How many chess pieces have been captured in th...,"\nWhite: d4, Black: Nf6\n(no capture at White:...",
4,How many chess pieces have been captured in th...,"\nWhite: e4, Black: e5\n(no capture at White: ...",


In [125]:
# Lead every explanation with a fixed intro line.
# NOTE: this overwrites the column in place, so running the cell twice prefixes twice.
answer_prefix = "Analysing each move pair -"
capture_df["capture_explanantion"] = answer_prefix + capture_df["capture_explanantion"]

In [126]:
# Map the capture columns onto the shared Question/Context/Answer schema
data = dict(
    Question=capture_df["moves"],
    Context=capture_df["Context"],
    Answer=capture_df["capture_explanantion"],
)

In [127]:
# Build the capture dataset from the Question/Context/Answer mapping in `data`.
capture = pd.DataFrame(data)

In [128]:
capture.head()

Unnamed: 0,Question,Context,Answer
0,How many chess pieces have been captured in th...,,"Analysing each move pair -\nWhite: d4, Black: ..."
1,How many chess pieces have been captured in th...,,"Analysing each move pair -\nWhite: e4, Black: ..."
2,How many chess pieces have been captured in th...,,"Analysing each move pair -\nWhite: e4, Black: ..."
3,How many chess pieces have been captured in th...,,"Analysing each move pair -\nWhite: d4, Black: ..."
4,How many chess pieces have been captured in th...,,"Analysing each move pair -\nWhite: e4, Black: ..."


In [129]:
# Write the capture dataset to capture.csv (relative to the working
# directory) without the index column.
capture.to_csv("capture.csv", index = False)

# FEN

In [130]:
# Read through a context manager so the file handle is closed right away.
with open("../data/2nd_finetune/fen_parsing.txt", "r") as fen_handle:
    fen_file = fen_handle.read()

# Entries alternate between a FEN line and its explanation, split on '-----\n'
entries = fen_file.split("-----\n")

In [131]:
print(entries[0])

4n3/8/8/8/4B3/1K2kr2/8/1q6



In [132]:
print(entries[1])

Empty chess board square is denoted by '1'
Total 6 pieces present in the board

            White Pieces: 2 pieces which are 1 x White Bishop, 1 x White King

            Black Pieces: 4 pieces which are 1 x Black Knight, 1 x Black King, 1 x Black Rook, 1 x Black Queen
            
Board State is:
 1 1 1 1 n 1 1 1
 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1
 1 1 1 1 B 1 1 1
 1 K 1 1 k r 1 1
 1 1 1 1 1 1 1 1
 1 q 1 1 1 1 1 1

Piece Positions are:
Black Knight at e8
White Bishop at e4
White King at b3
Black King at e3
Black Rook at f3
Black Queen at b1




In [133]:
# Preamble prepended to every FEN answer. Files are the columns (a-h) and
# ranks are the rows (1-8); a square is written as file then rank, e.g. a3.
fen_explanation = '''Analysing the FEN line where each chessboard row is divided by '/'.
Numbers represent empty squares. For example, "4" means 4 empty squares.
Each letter represents a chess piece: lowercase letters represent black pieces and uppercase letters represent white pieces.
For example, 'K' is a White King, 'N' is a White Knight and 'q' is a Black Queen.
The columns are known as files and range from a-h.
The rows are known as ranks and range from 1-8.
e.g. a White Knight at file "a" and rank "3" will be N at a3.

'''

In [134]:
# Prompt text placed in front of every FEN string
fen_question = (
    "Analyse the provided FEN and explain "
    "which chess pieces are present in the board - "
)

In [135]:
# Pair each FEN entry (even positions) with the explanation that follows it
# (odd positions); the final entry is never used as a FEN.
fen_pairs = list(zip(entries[:-1:2], entries[1::2]))

questions = [fen_question + board.strip() for board, _ in fen_pairs]
answers = [fen_explanation + detail.strip() for _, detail in fen_pairs]

# FEN prompts carry no context, so the column is a constant empty string
data = dict(Question=questions, Context="", Answer=answers)
fen = pd.DataFrame(data)

In [137]:
fen.head()

Unnamed: 0,Question,Context,Answer
0,Analyse the provided FEN and explain which che...,,Analysing the FEN line where each chessboard r...
1,Analyse the provided FEN and explain which che...,,Analysing the FEN line where each chessboard r...
2,Analyse the provided FEN and explain which che...,,Analysing the FEN line where each chessboard r...
3,Analyse the provided FEN and explain which che...,,Analysing the FEN line where each chessboard r...
4,Analyse the provided FEN and explain which che...,,Analysing the FEN line where each chessboard r...


In [136]:
# Write the FEN dataset to fen.csv (relative to the working directory)
# without the index column.
fen.to_csv("fen.csv", index=False)

# 1st finetune

In [6]:
# Load the first fine-tuning dataset; the first CSV column becomes the index.
finetune_1 = pd.read_csv("../data/finetune/diverse_task_3.csv", index_col=0)
finetune_1.head()

Unnamed: 0,moves,instruction,explanation
0,e4 e6 d4 d5 Nd2 c5 exd5 exd5 dxc5 Bxc5 Nb3 Bb6...,Assume you are a chess master. Who do you thin...,"White. The game dynamics favored White, who ef..."
1,e4 d5 exd5 Qxd5 Nc3 Qe5+ Be2 Na6 d4 Qf5 Bxa6 b...,Assume you are a chess master. Suggest the nex...,Qg7#
2,e4 d5 exd5 Qxd5 Nc3 Qe5+ Be2 Na6 d4 Qf5 Bxa6 b...,Assume you are a chess master. Suggest the nex...,Rd2+
3,d4 d5 Nf3 c5 c3 c4 Bf4 Nc6 e3 e6 Nbd2 Bd6 Bg3 ...,Assume you are a chess master. Who do you thin...,"White, achieved through superior tactical expl..."
4,e4 e5 Bc4 Nf6 Nc3 Bc5 Nf3 O-O O-O d6 d4 exd4 N...,Assume you are a chess master. Who do you thin...,"ReasonWhite emerged as the winner, largely due..."


In [7]:
# Build each prompt as "<instruction> - <moves>" (element-wise on the two columns)
questions = finetune_1["instruction"] + " - " + finetune_1["moves"]
# Show the prompt stored under index label 0 as a sanity check
questions[0]

'Assume you are a chess master. Who do you think will win the game based on the provided chess moves in Algebraic Notation. - e4 e6 d4 d5 Nd2 c5 exd5 exd5 dxc5 Bxc5 Nb3 Bb6 Nf3 Nf6 Bd3 Nc6 c3 h6 O-O O-O Bf4 Be6 Qd2 Ne4 Qc2 f5 Nbd4 Nxd4 Nxd4 Bd7 Rfe1 Rc8 Qb3 Nc5 Qxd5+ Kh8 Bc2 Ne4 Bxe4 fxe4 Qxe4 Bxd4 Qxd4 Bc6 Qxd8 Rcxd8 Bg3 Rd2 b4 Rc2 Re3 Rd8 a3 Rdd2 h4 g6 Rae1 Ra2 c4 a6 c5 Kg8 Kf1 Bb5+ Kg1 Bc6 Re6 Kf7 R1e3 h5 Rd6 Re2 Rxe2 Rxe2 f3 Ra2 Rd3 Ke6 Kf1 Bb5 c6 Bxd3+ Kg1 bxc6'

In [8]:
# The explanation column is used as the target answer for each prompt
answers = finetune_1["explanation"]

In [None]:
# Same Question/Context/Answer schema as the other datasets in this notebook;
# these prompts carry no context, so the column is a constant empty string.
finetune_3_tasks = pd.DataFrame({"Question": questions, "Context": "", "Answer": answers})