# Prepare Q&A

In [1]:
import pathlib
import pandas as pd

In [2]:
def list_csv_files(base_path: str | pathlib.Path, recursive: bool = False) -> list[pathlib.Path]:
    """
    Return Path objects for every *.csv file under `given path`.
    If `recursive` is True, look through sub-directories too.
    """
    base = pathlib.Path(base_path).expanduser().resolve()
    pattern = "**/*.csv" if recursive else "*.csv"
    return sorted(base.glob(pattern))


In [3]:
def load_and_clean(file_path: pathlib.Path) -> pd.DataFrame:
    """
    Read one quiz CSV, drop rows containing *any* NaNs,
    convert 'Correct' text → letter (A/B/C/D),
    and add a 'genre' column from the file name stem.
    """
    
    # read (skip the dummy index column)
    df = pd.read_csv(file_path, index_col=0)
    
    # normalise column names
    df.columns = [c.strip().lower() for c in df.columns]
    df = df.rename(columns={
        "questions": "question",
        "correct":   "answer"
    })
    # drop rows with *any* empty cells
    df = df.dropna(how="any").reset_index(drop=True)
    
    # figure out which letter (A/B/C/D) matches the 'answer' text
    def answer_letter(row):
        letter_to_num = {
            "A": 0,
            "B": 1,
            "C": 2,
            "D": 3,
        }
        for letter in ("A", "B", "C", "D"):
            if row["answer"].strip().casefold() == row[letter.lower()].strip().casefold():
                return letter_to_num[letter]
        return pd.NA
    
    df["answer"] = df.apply(answer_letter, axis=1)
    df = df.dropna(subset=["answer"])

    # replace all the \n with a white space in questions
    df["question"] = (
    df["question"]
      .str.replace(r'[\r\n]+', ' ', regex=True)
      .str.strip()
    )
    
    # add genre column
    df["genre"] = file_path.stem.split("category_")[1]
    df = df[["genre", "question", "answer", "a", "b", "c", "d"]]
    return df


In [4]:
BASE_DIR = r"/home/azureuser/trividuel/qna_preparation/archive"  

csv_files = list_csv_files(BASE_DIR) 
print(f"Found {len(csv_files)} CSV files")

frames = [load_and_clean(p) for p in csv_files]
combined = pd.concat(frames, ignore_index=True)

print(combined['question'][0])

combined.head(5)


Found 22 CSV files
Three of these animals hibernate. Which one does not?


Unnamed: 0,genre,question,answer,a,b,c,d
0,animals,Three of these animals hibernate. Which one do...,1,Mouse,Sloth,Frog,Snake
1,animals,All of these animals are omnivorous except one.,3,Fox,Mouse,Opossum,Snail
2,animals,Three of these Latin names are names of bears....,3,Melursus ursinus,Helarctos malayanus,Ursus minimus,Felis silvestris catus
3,animals,These are typical Australian animals except one.,3,Platypus,Dingo,Echidna,Sloth
4,animals,Representatives of three of these species prod...,3,Lizards,Scorpions,Frogs,Mosquitos


In [5]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41785 entries, 0 to 41784
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   genre     41785 non-null  object
 1   question  41785 non-null  object
 2   answer    41785 non-null  object
 3   a         41785 non-null  object
 4   b         41785 non-null  object
 5   c         41785 non-null  object
 6   d         41785 non-null  object
dtypes: object(7)
memory usage: 2.2+ MB


In [6]:
combined.to_csv("combined_quiz.csv", index=False)
print("Final File Saved")


Final File Saved


In [None]:
# save a copy directly to backend :)
combined.to_csv("../backend/app/data/combined_quiz.csv", index=False)