In [66]:
import os
import pandas as pd
os.listdir("../data/2nd_finetune/")

['RAFT',
 'fen_parsing.txt',
 'capture_explanation.csv',
 'final',
 '.ipynb_checkpoints']

# RAFT

In [67]:
raft_df = pd.DataFrame(columns=["Question", "Context", "Answer"])

In [68]:
os.listdir("../data/2nd_finetune/RAFT/")

['raft.csv',
 '.ipynb_checkpoints',
 'questions.txt',
 'raft.txt',
 'raft_o_c.txt',
 'raft_some_o_c.txt']

In [69]:
# context
from typing import List
from llama_index.core import SimpleDirectoryReader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def get_doc_chunk(file_path: str, separators: List[str] = ["\n\n", "\n"]) -> List[str]:
    docs = SimpleDirectoryReader(file_path).load_data()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 500,
        separators = separators,
        is_separator_regex = False
        )
    
    # we have a single document only
    chunks = text_splitter.split_text(docs[0].text)
    
    return [chunk.replace("\n", " ") for chunk in chunks]

docs = get_doc_chunk("../RAFT/text/")

In [70]:
len(docs)

18

### 1 relevant context

In [71]:
raft_main = open("../data/2nd_finetune/RAFT/raft.txt", "r").read()

In [72]:
# Splitting the file content into multiple entries based on the delimiter '=====\n'
entries = raft_main.split('=====\n')

# Initialize empty lists to store questions, answers, contexts, and indices
questions = []
answers = []
contexts = []

# need to process entries in chunks of 4 parts (question, answer, context, index)
for i in range(0, len(entries) - 1, 4):
    try:
        # Extract and strip the parts
        question = entries[i].strip()
        answer = entries[i + 1].strip()
        index = int(entries[i + 3].strip())

        # Append the parts to the respective lists
        questions.append(question)
        answers.append(answer)
        contexts.append(docs[index])
    except (IndexError, ValueError) as e:
        print(f"Skipping malformed entry at index {i}: {e}")

In [73]:
len(questions), len(contexts), len(answers)

(36, 36, 36)

In [74]:
data = {
    "Question": questions,
    "Context": contexts,
    "Answer": answers
}

raft_main_df = pd.DataFrame(data)

In [75]:
raft_main_df.head()

Unnamed: 0,Question,Context,Answer
0,"Who are Tardo and Peo in ""DISQUALIFIED"" by Cha...",DISQUALIFIED BY CHARLES L. FONTENAY After t...,"To determine who Tardo and Peo are in ""DISQUAL..."
1,What is the significance of the castle overloo...,DISQUALIFIED BY CHARLES L. FONTENAY After t...,To determine the significance of the castle ov...
2,Who entertained Tardo and Peo at luncheon?,"After the morning inspection tour, Tardo, the ...","To answer the question ""Who entertained Tardo ..."
3,What was served for dessert?,"After the morning inspection tour, Tardo, the ...","To determine what was served for dessert, we n..."
4,What technical aid is available aboard the ship?,"""My recommendation will be of considerable imp...","To answer the question ""What technical aid is ..."


### out of context

In [76]:
o_c_raft = open("../data/2nd_finetune/RAFT/raft_o_c.txt").read()

In [77]:
entries = o_c_raft.split('=====\n')

questions = []
answers = []
contexts = []

for i in range(0, len(entries) - 1, 3):
    try:
        question = entries[i].strip()
        answer = entries[i + 1].strip()
        context = entries[i + 2].strip()

        questions.append(question)
        answers.append(answer)
        contexts.append(context)
    except (IndexError, ValueError) as e:
        print(f"Skipping malformed entry at index {i}: {e}")

In [78]:
data = {
    "Question": questions,
    "Context": contexts,
    "Answer": answers
}

raft_o_c = pd.DataFrame(data)

In [79]:
raft_o_c.head()

Unnamed: 0,Question,Context,Answer
0,What is the capital of France?,DISQUALIFIED BY CHARLES L. FONTENAY After t...,The context provided is a science fiction stor...
1,What is the capital of Japan?,"After the morning inspection tour, Tardo, the ...",The context provided does not directly mention...
2,What is the capital of Canada?,"""My recommendation will be of considerable imp...",The context provided does not contain any rele...
3,What is the capital of Australia?,"""I'm afraid our culture is too simple and agra...",The context provided does not contain any rele...
4,What is the capital of Azerbaijan?,"""We really feel that we have done well since w...",The context provided does not contain any rele...


### some context

In [80]:
some_o_c_raft = open("../data/2nd_finetune/RAFT/raft_some_o_c.txt").read()

In [81]:
entries = some_o_c_raft.split('=====\n')

questions = []
answers = []
contexts = []

for i in range(0, len(entries) - 1, 3):
    try:
        question = entries[i].strip()
        context = entries[i + 1].strip()
        answer = entries[i + 2].strip()

        questions.append(question)
        answers.append(answer)
        contexts.append(context)
    except (IndexError, ValueError) as e:
        print(f"Skipping malformed entry at index {i}: {e}")

In [82]:
data = {
    "Question": questions,
    "Context": contexts,
    "Answer": answers
}

raft_some_o_c = pd.DataFrame(data)

In [83]:
raft_some_o_c.head()

Unnamed: 0,Question,Context,Answer
0,"Who are Tardo and Peo in ""DISQUALIFIED"" by Cha...",DISQUALIFIED BY CHARLES L. FONTENAY After t...,"Given the context provided, Tardo and Peo are ..."
1,What is the significance of the castle overloo...,DISQUALIFIED BY CHARLES L. FONTENAY After t...,"Given the context provided, the significance o..."
2,Who entertained Tardo and Peo at luncheon?,"After the morning inspection tour, Tardo, the ...","Given the context provided, we can see that Ta..."
3,What was served for dessert?,"After the morning inspection tour, Tardo, the ...","To answer the question ""What was served for de..."
4,What technical aid is available aboard the ship?,"""My recommendation will be of considerable imp...",The context provided indicates that there is c...


# raft main

In [84]:
raft_df = pd.concat([raft_main_df, raft_o_c, raft_some_o_c], ignore_index=True).sample(frac=1).reset_index(drop=True)

In [85]:
raft_df.head(10)

Unnamed: 0,Question,Context,Answer
0,What were the main difficulties faced by the c...,"""You seem to have been lucky, though,"" said Pe...",To answer the question about the main difficul...
1,What factors would Tardo consider significant ...,Peo tried to notice what he thought Tardo woul...,The context provides information about Peo try...
2,Who typically works the fields in the village?,"Tardo asked about the fields. ""I see there is...","To answer the question ""Who typically works th..."
3,Where did the concept of a union come from in ...,"""They are paid,"" answered Saranta, and added r...",To answer the question of where the concept of...
4,What is the reason for the lack of workers in ...,"Tardo asked about the fields. ""I see there is...",The context provided describes a conversation ...
5,What is the significance of the castle overloo...,DISQUALIFIED BY CHARLES L. FONTENAY After t...,"Given the context provided, the significance o..."
6,What were Tardo and Peo doing as Alpha Persei ...,Alpha Persei was sinking in the western sky wh...,"To answer the question, we need to focus on th..."
7,What is the only piece that can jump over othe...,"Saranta apologized for their having to walk, e...",The context provided does not directly mention...
8,What was Tardo most intent on?,Tardo had seemed most intent on the question o...,"To answer the question ""What was Tardo most in..."
9,What was the challenge faced by colonizing shi...,"""If you haven't seen them, how do you know the...","Given the context provided, the challenge face..."


In [86]:
raft_df.shape

(96, 3)

In [87]:
raft_df.to_csv("raft.csv", index=False)

# capture explanation

In [88]:
capture_df = pd.read_csv("../data/2nd_finetune/capture_explanation.csv", index_col=0)
capture_df.head()

Unnamed: 0,moves,game_length,capture_explanantion
0,d4 f5 Bf4 Nf6 e3 d6 Nc3 Nc6 Bc4 e5 dxe5 dxe5 Q...,31,"\nWhite: d4, Black: f5\n(no capture at White: ..."
1,e4 e5 Nf3 f5 d3 fxe4 dxe4 Bb4+ Bd2 Bxd2+ Nbxd2...,43,"\nWhite: e4, Black: e5\n(no capture at White: ..."
2,e4 c5 Nf3 g6 d4 cxd4 Nxd4 Nc6 Nxc6 bxc6 Nc3 Bg...,14,"\nWhite: e4, Black: c5\n(no capture at White: ..."
3,d4 Nf6 c4 g6 Nc3 Bg7 e4 d6 f3 O-O Be3 e5 d5 c6...,40,"\nWhite: d4, Black: Nf6\n(no capture at White:..."
4,e4 e5 d3 Nf6 Bg5 Nc6 Bxf6 gxf6 a3 Qe7 b4 d6 c4...,19,"\nWhite: e4, Black: e5\n(no capture at White: ..."


In [89]:
capture_df["moves"] = "How many chess pieces have been captured in the following game provided in algebraic notation - " + capture_df["moves"]

In [90]:
capture_df.loc[0]["moves"]

'How many chess pieces have been captured in the following game provided in algebraic notation - d4 f5 Bf4 Nf6 e3 d6 Nc3 Nc6 Bc4 e5 dxe5 dxe5 Qxd8+ Kxd8 O-O-O+ Bd6 Bg5 Ke7 Nd5+ Kf8 Nxf6 gxf6 Bxf6 Rg8 Bxg8 Kxg8 Nf3 Kf7 Bh4 Be6 a3 b5 h3 b4 axb4 Bxb4 Bg3 Bd6 Rd2 a5 Rhd1 f4 exf4 exf4 Bh4 a4 Ng5+ Kf6 Ne4+ Ke5 Nxd6 cxd6 Rxd6 a3 Rxc6 axb2+ Kxb2 Ra2+ Kc3 Ra3+ Kb4'

In [91]:
capture_df["Context"] = ""
capture_df.drop(columns=["game_length"], inplace=True)

In [92]:
capture_df.head()

Unnamed: 0,moves,capture_explanantion,Context
0,How many chess pieces have been captured in th...,"\nWhite: d4, Black: f5\n(no capture at White: ...",
1,How many chess pieces have been captured in th...,"\nWhite: e4, Black: e5\n(no capture at White: ...",
2,How many chess pieces have been captured in th...,"\nWhite: e4, Black: c5\n(no capture at White: ...",
3,How many chess pieces have been captured in th...,"\nWhite: d4, Black: Nf6\n(no capture at White:...",
4,How many chess pieces have been captured in th...,"\nWhite: e4, Black: e5\n(no capture at White: ...",


In [93]:
capture_df["capture_explanantion"] = "Analysing each move pair -" + capture_df["capture_explanantion"]

In [94]:
data = {
    "Question" : capture_df["moves"],
    "Context" : capture_df["Context"],
    "Answer" : capture_df["capture_explanantion"] 
}

In [95]:
capture = pd.DataFrame(data)

In [96]:
capture.head()

Unnamed: 0,Question,Context,Answer
0,How many chess pieces have been captured in th...,,"Analysing each move pair -\nWhite: d4, Black: ..."
1,How many chess pieces have been captured in th...,,"Analysing each move pair -\nWhite: e4, Black: ..."
2,How many chess pieces have been captured in th...,,"Analysing each move pair -\nWhite: e4, Black: ..."
3,How many chess pieces have been captured in th...,,"Analysing each move pair -\nWhite: d4, Black: ..."
4,How many chess pieces have been captured in th...,,"Analysing each move pair -\nWhite: e4, Black: ..."


In [97]:
capture.shape

(50, 3)

In [98]:
capture.to_csv("capture.csv", index = False)

# FEN

In [99]:
fen_file = open("../data/2nd_finetune/fen_parsing.txt", "r").read()
entries = fen_file.split("-----\n")

In [100]:
print(entries[0])

4n3/8/8/8/4B3/1K2kr2/8/1q6



In [101]:
print(entries[1])

Empty chess board square is denoted by '1'
Total 6 pieces present in the board

            White Pieces: 2 pieces which are 1 x White Bishop, 1 x White King

            Black Pieces: 4 pieces which are 1 x Black Knight, 1 x Black King, 1 x Black Rook, 1 x Black Queen
            
Board State is:
 1 1 1 1 n 1 1 1
 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1
 1 1 1 1 B 1 1 1
 1 K 1 1 k r 1 1
 1 1 1 1 1 1 1 1
 1 q 1 1 1 1 1 1

Piece Positions are:
Black Knight at e8
White Bishop at e4
White King at b3
Black King at e3
Black Rook at f3
Black Queen at b1




In [102]:
fen_explanation = '''Analysing the FEN line where each chessboard row is divided by '/'.
Numbers represent empty squares. For example, "4" means 4 empty squares.
Each letter represents a chess piece: lowercase letters represent black pieces and uppercase letters represent white pieces.
For example, 'K' is a White King, 'N' is  and 'q' is a Black Queen.
The columns are known as files range from 1-8.
The rows are known as ranks range from a-h.
e.g. a White Knight at rank "a" and file "3" will be N at a3.

'''

In [103]:
fen_question = "Analyse the provided FEN and explain which chess pieces are present in the board - "

In [104]:
questions = []
answers = []

for i in range(0, len(entries)-1, 2):
    questions.append(fen_question + entries[i].strip())
    answers.append(fen_explanation + entries[i+1].strip())

data = {
    "Question" : questions,
    "Context" : "",
    "Answer" : answers
}
fen = pd.DataFrame(data)

In [105]:
fen.head()

Unnamed: 0,Question,Context,Answer
0,Analyse the provided FEN and explain which che...,,Analysing the FEN line where each chessboard r...
1,Analyse the provided FEN and explain which che...,,Analysing the FEN line where each chessboard r...
2,Analyse the provided FEN and explain which che...,,Analysing the FEN line where each chessboard r...
3,Analyse the provided FEN and explain which che...,,Analysing the FEN line where each chessboard r...
4,Analyse the provided FEN and explain which che...,,Analysing the FEN line where each chessboard r...


In [106]:
fen.shape

(50, 3)

In [107]:
fen.to_csv("fen.csv", index=False)

# 1st finetune

In [108]:
finetune_1 = pd.read_csv("../data/finetune/diverse_task_3.csv", index_col=0)
finetune_1.head()

Unnamed: 0,moves,instruction,explanation
0,e4 e6 d4 d5 Nd2 c5 exd5 exd5 dxc5 Bxc5 Nb3 Bb6...,Assume you are a chess master. Who do you thin...,"White. The game dynamics favored White, who ef..."
1,e4 d5 exd5 Qxd5 Nc3 Qe5+ Be2 Na6 d4 Qf5 Bxa6 b...,Assume you are a chess master. Suggest the nex...,Qg7#
2,e4 d5 exd5 Qxd5 Nc3 Qe5+ Be2 Na6 d4 Qf5 Bxa6 b...,Assume you are a chess master. Suggest the nex...,Rd2+
3,d4 d5 Nf3 c5 c3 c4 Bf4 Nc6 e3 e6 Nbd2 Bd6 Bg3 ...,Assume you are a chess master. Who do you thin...,"White, achieved through superior tactical expl..."
4,e4 e5 Bc4 Nf6 Nc3 Bc5 Nf3 O-O O-O d6 d4 exd4 N...,Assume you are a chess master. Who do you thin...,"ReasonWhite emerged as the winner, largely due..."


In [109]:
questions = finetune_1["instruction"] + " - " + finetune_1["moves"]
questions[0]

'Assume you are a chess master. Who do you think will win the game based on the provided chess moves in Algebraic Notation. - e4 e6 d4 d5 Nd2 c5 exd5 exd5 dxc5 Bxc5 Nb3 Bb6 Nf3 Nf6 Bd3 Nc6 c3 h6 O-O O-O Bf4 Be6 Qd2 Ne4 Qc2 f5 Nbd4 Nxd4 Nxd4 Bd7 Rfe1 Rc8 Qb3 Nc5 Qxd5+ Kh8 Bc2 Ne4 Bxe4 fxe4 Qxe4 Bxd4 Qxd4 Bc6 Qxd8 Rcxd8 Bg3 Rd2 b4 Rc2 Re3 Rd8 a3 Rdd2 h4 g6 Rae1 Ra2 c4 a6 c5 Kg8 Kf1 Bb5+ Kg1 Bc6 Re6 Kf7 R1e3 h5 Rd6 Re2 Rxe2 Rxe2 f3 Ra2 Rd3 Ke6 Kf1 Bb5 c6 Bxd3+ Kg1 bxc6'

In [110]:
answers = finetune_1["explanation"]

In [111]:
finetune_3_tasks = pd.DataFrame({"Question" : questions, "Context" : "", "Answer" : answers})

In [112]:
finetune_3_tasks.head()

Unnamed: 0,Question,Context,Answer
0,Assume you are a chess master. Who do you thin...,,"White. The game dynamics favored White, who ef..."
1,Assume you are a chess master. Suggest the nex...,,Qg7#
2,Assume you are a chess master. Suggest the nex...,,Rd2+
3,Assume you are a chess master. Who do you thin...,,"White, achieved through superior tactical expl..."
4,Assume you are a chess master. Who do you thin...,,"ReasonWhite emerged as the winner, largely due..."


In [113]:
finetune_3_tasks.to_csv("finetune_3_tasks.csv", index=False)

# no context

In [114]:
no_context = pd.concat([fen, capture, finetune_3_tasks], ignore_index=True).sample(frac=1).reset_index(drop=True)

In [115]:
no_context.shape

(400, 3)

In [116]:
no_context

Unnamed: 0,Question,Context,Answer
0,"Assume you are a chess master, explain the str...",,"### Game Analysis\n\n1. **White: d4, Black: d5..."
1,"Assume you are a chess master, explain the str...",,"**Game Analysis:**\n\n1. **White: c4, Black: e..."
2,Assume you are a chess master. Suggest the nex...,,Qf4
3,How many chess pieces have been captured in th...,,"Analysing each move pair -\nWhite: e4, Black: ..."
4,Assume you are a chess master. Suggest the nex...,,gxf4
...,...,...,...
395,,,d4
396,Assume you are a chess master. Who do you thin...,,BlackReason Black successfully navigated the s...
397,Assume you are a chess master. Suggest the nex...,,Rd2+
398,Assume you are a chess master. Who do you thin...,,"White demonstrated better tactical awareness, ..."


# add irrelevant context

In [142]:
irr_context = no_context.sample(frac = 0.2).reset_index(drop = True)

In [143]:
irr_context.head()

Unnamed: 0,Question,Context,Answer
0,Assume you are a chess master. Who do you thin...,,"The game, as described, doesn't clearly denote..."
1,"Assume you are a chess master, explain the str...",,"### Game Analysis\n\n1. **White: d4, Black: Nf..."
2,Assume you are a chess master. Suggest the nex...,,Ke6
3,Assume you are a chess master. Suggest the nex...,,f3
4,Assume you are a chess master. Who do you thin...,,AnalysisA clear winner isn't designated from t...


In [144]:
irr_context.shape

(80, 3)

In [145]:
import random
contexts = [docs[random.randint(0, 17)]for i in range(0, 80)]

In [146]:
irr_context["Context"] = contexts
irr_context.head()

Unnamed: 0,Question,Context,Answer
0,Assume you are a chess master. Who do you thin...,"""There will be no inspection tour tomorrow, an...","The game, as described, doesn't clearly denote..."
1,"Assume you are a chess master, explain the str...","Tardo laughed. ""A carry-over from Earth, no d...","### Game Analysis\n\n1. **White: d4, Black: Nf..."
2,Assume you are a chess master. Suggest the nex...,"""But you were able to solve this situation in ...",Ke6
3,Assume you are a chess master. Suggest the nex...,"""We really feel that we have done well since w...",f3
4,Assume you are a chess master. Who do you thin...,"""Saranta said that. But I don't see ..."" ""Tho...",AnalysisA clear winner isn't designated from t...


In [147]:
irr_context["Answer"] = irr_context.apply(lambda row : f'''The provided context - 
'{row["Context"]}'
does not mention anything relvant to answer the question -
'{row["Question"]}'
So ignoring the context.

Answer : ''' + row["Answer"], axis = 1)

In [148]:
irr_context.head()

Unnamed: 0,Question,Context,Answer
0,Assume you are a chess master. Who do you thin...,"""There will be no inspection tour tomorrow, an...","The provided context - \n'""There will be no in..."
1,"Assume you are a chess master, explain the str...","Tardo laughed. ""A carry-over from Earth, no d...","The provided context - \n'Tardo laughed. ""A c..."
2,Assume you are a chess master. Suggest the nex...,"""But you were able to solve this situation in ...","The provided context - \n'""But you were able t..."
3,Assume you are a chess master. Suggest the nex...,"""We really feel that we have done well since w...","The provided context - \n'""We really feel that..."
4,Assume you are a chess master. Who do you thin...,"""Saranta said that. But I don't see ..."" ""Tho...","The provided context - \n'""Saranta said that. ..."


In [149]:
print(irr_context.loc[0]["Answer"])

The provided context - 
'"There will be no inspection tour tomorrow, and I shall recommend against aid at this time," replied Tardo. "I've seen enough."  "Why?" asked Peo, surprised.  "There are two classes of people on this planet, and we've seen only one," said Tardo. "Those we have seen are freemen. The others are no better than animals. We give no aid that helps men tighten their hold over their fellows."'
does not mention anything relvant to answer the question -
'Assume you are a chess master. Who do you think will win the game based on the provided chess moves in Algebraic Notation. - d4 Nf6 c4 e6 Nf3 b6 g3 Bb7 Bg2 Be7 Nc3 O-O O-O Na6 a3 Rc8 b4 c5 bxc5 bxc5 d5 Ne8 Rb1 Nd6 Bf4 Ba8 Qa4 Nc7 Ne5 Bf6 Nxd7 Bxc3 Bxd6 Re8 Bxc5 exd5 e3 Ne6 Bxa7 Bc6 Qc2 Qxd7 cxd5 Bxd5 Bxd5 Qxd5 Rfd1 Qa5 Bb6 Qxa3 Rb3 Qa6 Rd6 Be5 Qd3 Qa1+ Kg2 Bxd6 Qxd6 Qa8+ Kg1 Rc1+'
So ignoring the context.

Answer : The game, as described, doesn't clearly denote a definitive winner but from the sequence, it looks like Bla

In [150]:
irr_context.to_csv("irr_context.csv", index=False)

# finetue _2

In [151]:
df = pd.concat([no_context, irr_context, raft_df], ignore_index=True).sample(frac = 1)
df.shape

(576, 3)

In [None]:
df.to_csv("finetune_2.csv", index = False)

# testing with smaller df

In [8]:
import pandas as pd
fen = pd.read_csv("fen.csv")
capture = pd.read_csv("capture.csv")
raft = pd.read_csv("raft.csv").sample(frac = 0.4)
old_fine_tune = pd.read_csv("finetune_3_tasks.csv").sample(frac=0.7)
short_df = pd.concat([fen, capture, raft, old_fine_tune], ignore_index= True).sample(frac=1)
short_df.shape

(348, 3)

In [9]:
short_df.head()

Unnamed: 0,Question,Context,Answer
283,"Assume you are a chess master, explain the str...",,"## Game Analysis\n\n1. **White: d4, Black: d5*..."
94,How many chess pieces have been captured in th...,,"Analysing each move pair -\nWhite: e4, Black: ..."
270,Assume you are a chess master. Suggest the nex...,,Qd7
28,Analyse the provided FEN and explain which che...,,Analysing the FEN line where each chessboard r...
253,Assume you are a chess master. Who do you thin...,,White. White successfully executed a checkmate...


In [10]:
short_df.to_csv("sub_finetune_2.csv", index = False)

In [3]:
import pandas as pd
pd.read_csv("sub_finetune_2.csv").shape

(348, 3)