In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import re

In [2]:
# Root folders of the two source datasets; each is passed to make_data(),
# which walks its immediate sub-directories.
# NOTE(review): the variable is spelled "tannenbaum" while the folder is
# "Tanenbaum_data" — name left unchanged because a later cell references it.
galvin_dir = "../Data/Galvin_data"
tannenbaum_dir  = "../Data/Tanenbaum_data"

In [3]:
def list_directories(p_dir):
    """Return the names of the immediate sub-directories of ``p_dir``.

    Only entry names (not full paths) are returned, in the order in which
    ``os.listdir`` yields them.
    """
    subdirs = []
    for name in os.listdir(p_dir):
        # Keep the entry only when it resolves to a directory on disk.
        if os.path.isdir(os.path.join(p_dir, name)):
            subdirs.append(name)
    return subdirs

In [4]:
def list_files(directory):
    """Return the names of the regular files located directly in ``directory``.

    Sub-directories are skipped; names are returned without the directory
    prefix, in ``os.listdir`` order.
    """
    def _is_file(name):
        return os.path.isfile(os.path.join(directory, name))

    return list(filter(_is_file, os.listdir(directory)))

In [5]:
def format_str(s) -> str:
    """Normalise a raw chain-of-thought answer into ``<Reason>`` / ``<Answer>`` form.

    Steps:
      1. A leading ``assistant:`` prefix is replaced by ``<Reason>: ``.
      2. The text is trimmed to start one character before its first
         alphabetic character (which keeps the ``<`` of the tag) and to end
         one character after its last alphabetic character (which keeps a
         closing punctuation mark). With fewer than two alphabetic
         characters the text runs to the end of the string.
      3. Every final-answer marker (``**Final Answer**``, ``Final Answer:``,
         ``**Answer:**``, ``Therefore,`` ...; case-insensitive, optionally
         preceded by a list number and followed by a colon) is replaced by a
         newline followed by ``<Answer>``.

    Args:
        s: Raw answer text.

    Returns:
        The cleaned text.

    NOTE(review): the prompt template in ``make_instruction`` asks for
    ``<REASON>`` / ``<ANSWER>`` in upper case, while this function emits
    ``<Reason>`` / ``<Answer>`` — confirm which spelling is intended.
    """
    if s.startswith("assistant:"):
        s = s.replace("assistant:", "<Reason>: ", 1)

    # Locate the first alphabetic character and the last one that follows it.
    start_index, end_index = 0, len(s) - 1
    beg_found = False
    for i, ch in enumerate(s):
        if ch.isalpha():
            if not beg_found:
                start_index, beg_found = i, True
            else:
                end_index = i

    # Clamp the start at 0: without this, text whose first character is a
    # letter produced the slice s[-1:...], i.e. only the last character.
    start = max(start_index - 1, 0)
    stop = min(end_index + 2, len(s))
    text = s[start:stop]

    pattern = re.compile(r'''
            (\d+\.)?   # optional list number such as "3."
            (\s+)?     # optional whitespace
            (\*\*Final\ Answer\*\*|\*\*Final\ Answer:\*\*|Final\ Answer:|\*\*Answer\*\*|\*\*Answer:\*\*|Therefore,)
            (\s+)?     # optional whitespace
            :?         # optional colon
        ''', re.IGNORECASE | re.VERBOSE)

    return pattern.sub("\n<Answer>", text)

In [6]:
def replace_context(df):
    """Move each row's oracle document to the front of its context list.

    For every row, the first list stored under ``context['sentences']`` is
    modified in place: the row's ``oracle_context`` value is removed from its
    current position and re-inserted at index 0. ``list.remove`` raises
    ``ValueError`` when the oracle is not present in that list.

    Returns the same DataFrame object that was passed in.
    """
    for ctx, oracle in zip(df.context.tolist(), df.oracle_context.tolist()):
        docs = ctx['sentences'][0]
        docs.remove(oracle)
        docs.insert(0, oracle)
    return df

In [7]:
def make_data(p_dir):
    """Load, clean and merge every JSON-lines file one level below ``p_dir``.

    For each immediate sub-directory of ``p_dir`` all files are read with
    ``pd.read_json(..., lines=True)`` and stacked; the ``cot_answer`` column
    is cleaned with ``format_str``. The per-directory frames are then stacked
    into one frame with a fresh 0..n-1 index, and ``replace_context`` moves
    each row's oracle document to the front of its context.

    Frames are collected in lists and concatenated once, instead of growing a
    DataFrame inside the loops. A sub-directory that contains no files is
    skipped rather than failing on the missing ``cot_answer`` column.

    Args:
        p_dir: Directory whose sub-directories hold the JSON-lines files.

    Returns:
        The merged, cleaned DataFrame.
    """
    frames = []
    for subdir in list_directories(p_dir):
        data_dir = os.path.join(p_dir, subdir)
        parts = [
            pd.read_json(os.path.join(data_dir, file), lines=True)
            for file in tqdm(list_files(data_dir), desc="Processing files")
        ]
        if not parts:
            # Nothing was read, so there is no 'cot_answer' column to clean.
            continue
        df = pd.concat(parts, ignore_index=True)
        df['cot_answer'] = df['cot_answer'].apply(format_str)
        frames.append(df)

    # pd.concat rejects an empty list; fall back to an empty frame so the
    # behaviour for a directory without data matches the original code path.
    df_final = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return replace_context(df_final)

In [8]:
# Build one cleaned DataFrame per source directory; make_data shows one
# "Processing files" progress bar per sub-directory it reads.
df_g = make_data(galvin_dir)
df_t = make_data(tannenbaum_dir)

Processing files: 100%|█████████████████████████| 65/65 [00:02<00:00, 32.16it/s]
Processing files: 100%|██████████████████████| 282/282 [00:02<00:00, 112.42it/s]
Processing files: 100%|██████████████████████| 106/106 [00:01<00:00, 104.60it/s]


In [9]:
# Stack both cleaned frames and renumber the rows from 0.
df_final = pd.concat([df_g, df_t]).reset_index(drop=True)

In [10]:
# Sanity check: report every context whose first sentence list contains
# duplicate entries.
# NOTE(review): only df_g is inspected, although df_final (which also holds
# df_t) is what gets written to disk — confirm whether df_t should be checked.
for ctx in df_g.context:
    docs = ctx['sentences'][0]
    if len(docs) > len(set(docs)):
        print("Anomaly found !")

In [12]:
# Persist the merged, cleaned dataset as JSON.
df_final.to_json("../Data/clean_data.json")

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# NOTE(review): `df` is not defined in any cell visible above; presumably a
# cleaned frame loaded in an earlier session (the output file names further
# down suggest the Tanenbaum data) — confirm before re-running.
# Plain Python lists of the per-row context dicts and oracle documents.
ctxs = df.context.tolist()
oracles = df.oracle_context.tolist()

In [9]:
# Strip the oracle document from every row's context. The lists are mutated
# in place; list.remove raises ValueError when the oracle is absent.
for ctx, oracle in zip(ctxs, oracles):
    ctx['sentences'][0].remove(oracle)

In [10]:
# Hold out 20% of the rows as the "no oracle" split. A fixed random_state makes
# the split (and therefore the two files saved below) reproducible across
# kernel restarts; without it every run produced a different partition.
df_oracle, df_no_oracle = train_test_split(df, test_size=0.2, random_state=42)
# Renumber both splits from 0 without mutating the frames in place.
df_oracle = df_oracle.reset_index(drop=True)
df_no_oracle = df_no_oracle.reset_index(drop=True)

In [11]:
# Context dicts and oracle documents of the oracle split only.
ctxs = df_oracle.context.tolist()
oracles = df_oracle.oracle_context.tolist()

In [12]:
# Put the oracle document at position 0 of each context in the oracle split.
# This overwrites whatever document was at index 0 rather than inserting, so
# the list length stays the same.
for c,o in zip(ctxs,oracles):
    c['sentences'][0][0] = o

In [13]:
# Save both splits as JSON.
# NOTE(review): these paths are relative to the working directory
# ("Train_data/..."), whereas a later cell reads
# "../Data/Train_data/Tanenbaum_no_oracle.json" — confirm the working directory.
df_oracle.to_json("Train_data/Tanenbaum_oracle.json")
df_no_oracle.to_json("Train_data/Tanenbaum_no_oracle.json")

In [22]:
# Spot-check one cleaned answer: the row with index label 600 of the oracle split.
df_oracle.cot_answer[600]

'<REASON>:  To answer the question about unique features of multiprocessor operating systems, we need to consider the context provided. \n\n1. **Definition of Multiprocessor Operating Systems**: The context defines multiprocessor operating systems as normal operating systems that handle system calls, memory management, file systems, and I/O devices. However, they have unique features in areas such as process synchronization, resource management, and scheduling.\n\n2. **Unique Features of Multiprocessor Operating Systems**:\n   - **Process Synchronization**: Multiprocessor operating systems need to manage synchronization between processes running on different CPUs to avoid conflicts and ensure data consistency.\n   - **Resource Management**: Efficient allocation and management of shared resources among multiple CPUs is crucial in multiprocessor systems to prevent bottlenecks and optimize performance.\n   - **Scheduling**: Multiprocessor operating systems require advanced scheduling algo

In [None]:
# Final cleaning

In [1]:
import pandas as pd

In [5]:
# Merge the saved no-oracle splits of both books into one frame.
# NOTE(review): the result is named `df_oracle` although it is built from the
# *no_oracle* files — name left unchanged because later cells reference it.
df_oracle = pd.concat([pd.read_json("../Data/Train_data/Galvin_no_oracle.json"),pd.read_json("../Data/Train_data/Tanenbaum_no_oracle.json")])

In [6]:
# Keep only the columns needed to build prompts and renumber the rows from 0.
df_oracle = df_oracle[['question', 'context', 'cot_answer']].reset_index(drop=True)

In [7]:
# Persist the merged question/context/cot_answer frame as JSON.
df_oracle.to_json("../Data/no_oracle_uncut.json")

In [102]:
def make_instruction(ctx, question, answer, max_words=3000):
    """Build one ``[INST] <<SYS>> ... [/INST]`` training string.

    The documents in ``ctx['sentences'][0]`` are joined into one context
    string in which every document is preceded by a ``<DOCUMENT>`` tag. The
    context is then truncated to its first ``max_words`` whitespace-separated
    tokens (the tags count as tokens, and runs of whitespace collapse to a
    single space).

    Args:
        ctx: Mapping whose ``'sentences'`` entry holds, at index 0, the list
            of document strings.
        question: Question text placed in the user part of the prompt.
        answer: Target answer appended after ``[/INST]``.
        max_words: Maximum number of context tokens to keep. Defaults to
            3000, the previously hard-coded limit; ``None`` disables
            truncation.

    Returns:
        The full prompt string.
    """
    docs = ctx['sentences'][0]
    words = ("<DOCUMENT> " + " <DOCUMENT> ".join(docs)).split()
    chunk = " ".join(words[:max_words])
    system_prompt = """
<<SYS>>
You are an AI assistant that helps answer questions based on provided context. Here are things to pay attention to:

- First provide step-by-step reasoning on how to answer the question in the form <REASON>: $reason
- In the reasoning, if you need to reference some sentences from the context, include them in ##begin_quote## and ##end_quote##.
- End your response with a final answer in the form <ANSWER>: $answer. The answer should be succinct.
- <DOCUMENT> tag in Context marks the beginning of a new document chunk.
<</SYS>>"""

    user_prompt = f"""
Answer this Question: {question}
using the Context: {chunk}
"""
    prompt = f"[INST] {system_prompt} \n{user_prompt} [/INST] \n{answer}"
    return prompt


In [103]:
# Accumulator for the rendered prompt strings, one per row of df_oracle.
texts = []

In [104]:
# Render one training prompt per row; rows are addressed by the index labels
# 0..len(df_oracle)-1.
texts.extend(
    make_instruction(df_oracle.context[i], df_oracle.question[i], df_oracle.cot_answer[i])
    for i in range(len(df_oracle))
)

In [105]:
# Write the prompts as a single-column ("text") CSV without the row index.
# NOTE(review): this path is relative to the working directory, unlike the
# "../Data/..." paths used when loading — confirm the intended location.
pd.DataFrame({"text":texts}).to_csv("Train_data/no_oracle.csv",index=False)