# Prepare Q&A

In [1]:
import pathlib
import pandas as pd
import re
from typing import Iterable, Optional

In [2]:
def list_csv_files(base_path: str | pathlib.Path, recursive: bool = False) -> list[pathlib.Path]:
    """
    Return Path objects for every *.csv file under `given path`.
    If `recursive` is True, look through sub-directories too.
    """
    base = pathlib.Path(base_path).expanduser().resolve()
    pattern = "**/*.csv" if recursive else "*.csv"
    return sorted(base.glob(pattern))


In [3]:
def remove_long_questions(df, col="question", max_len=40, preview_rows=5):
    """
    Splits the input DataFrame into
      • long_q  - rows where len(question) >= max_len
      • clean   - all other rows
    
    Prints `preview_rows` from long_q for transparency.
    """
    # Boolean mask for long questions
    mask = df[col].str.len() >= max_len
    
    long_q = df[mask].copy()
    clean  = df[~mask].copy()
    
    # Optional transparency step
    if not long_q.empty:
        print(f"\n{len(long_q):,} questions ≥ {max_len} chars "
              f"({preview_rows} shown):")
        print(long_q.head(preview_rows)[col].to_string(index=False))
    
    return clean, long_q


In [None]:

def load_and_clean(
    file_path: pathlib.Path,
    *,
    max_len: Optional[int] = 300,
    keywords: Optional[Iterable[str]] = None,
    preview_rows: int = 5,
    verbose=False,
) -> pd.DataFrame:
    """
    Read one quiz CSV, clean it, and (NEW) filter rows
    1. Drop rows whose question length is ≥ max_len
    2. Drop rows whose question contains ANY keywords

    Parameters
    ----------
    file_path : pathlib.Path
        Path to a single quiz CSV.
    max_len : int | None, default None
        Character length threshold for the question filter.
        If None, the length filter is skipped.
    keywords : Iterable[str] | None, default None
        Words/phrases that trigger removal.  If None or empty,
        the keyword filter is skipped.
    preview_rows : int, default 5
        Number of sample rows to print for transparency.

    Returns
    -------
    pd.DataFrame
        Cleaned quiz data.
    """
    df = pd.read_csv(file_path, index_col=0)


    # DROP TF QUESTIONS
    df.columns = [c.strip().lower() for c in df.columns]
    df = df.rename(columns={"questions": "question", "correct": "answer"})
    df = df.dropna(how="any").reset_index(drop=True)

    # map answer text → letter index (0–3)
    def answer_letter(row):
        letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}
        for letter in ("A", "B", "C", "D"):
            if row["answer"].strip().casefold() == row[letter.lower()].strip().casefold():
                return letter_to_num[letter]
        return pd.NA

    df["answer"] = df.apply(answer_letter, axis=1)
    df = df.dropna(subset=["answer"])

    # tidy question text
    df["question"] = (
        df["question"]
        .str.replace(r"[\r\n]+", " ", regex=True)
        .str.strip()
    )

    # filtering out questions that are too long
    if max_len is not None:
        len_mask = df["question"].str.len() >= max_len
        if len_mask.any():
            long_q = df.loc[len_mask, "question"]
            if verbose:
                print(
                    f"\n  {len_mask.sum():,} questions ≥ {max_len} characters "
                    f"(showing {preview_rows}):"
                )
                print(long_q.head(preview_rows).to_string(index=False))

        df = df.loc[~len_mask]

    # filtering out keywords
    if keywords:
        # \b(word1|word2|word3)\b  – whole-word, case-insensitive
        escaped = map(re.escape, keywords)
        pattern = rf"\b(?:{'|'.join(escaped)})\b"
        kw_mask = df["question"].str.contains(pattern, case=False, na=False, regex=True)

        if kw_mask.any():
            hit_q = df.loc[kw_mask, "question"]
            if verbose:
                print(
                    f"\n  {kw_mask.sum():,} questions matched keyword filter "
                    f"(showing {preview_rows}):"
                )
                print(hit_q.head(preview_rows).to_string(index=False))

        df = df.loc[~kw_mask]


    df["genre"] = file_path.stem.split("category_")[1]
    df = df[["genre", "question", "answer", "a", "b", "c", "d"]]
    return df.reset_index(drop=True)


In [None]:
BASE_DIR = r"/home/azureuser/trividuel/data_prep/chosen"  

# too much questions regarding harry potter, jesus and the bible
keywords_to_remove = ["jesus", "bible", "harry potter"]

csv_files = list_csv_files(BASE_DIR) 
print(f"Found {len(csv_files)} CSV files")

frames = [load_and_clean(
    p,
    max_len=200,
    keywords=keywords_to_remove,
    verbose=False
) for p in csv_files]

combined = pd.concat(frames, ignore_index=True)

print(combined['question'][0])

combined.head(5)


Found 5 CSV files

  2 questions matched keyword filter (showing 5):
The Bible tells the story of a woman that was t...
What is the name of the river in which Jesus Ch...

  66 questions matched keyword filter (showing 5):
Jesus was fond of using parables to give his me...
      Who wrote the first five books of the Bible?
In the Bible, how many wives and concubines did...
According to the accepted theory, how old was J...
How old was Jesus when He performed His first m...

  3 questions matched keyword filter (showing 5):
What is the largest number mentioned in the Bible?
References to measurements of a circular basin ...
According to the Bible, Eve was made out of whi...
Three of these animals hibernate. Which one does not?


Unnamed: 0,genre,question,answer,a,b,c,d
0,animals,Three of these animals hibernate. Which one do...,1,Mouse,Sloth,Frog,Snake
1,animals,All of these animals are omnivorous except one.,3,Fox,Mouse,Opossum,Snail
2,animals,Three of these Latin names are names of bears....,3,Melursus ursinus,Helarctos malayanus,Ursus minimus,Felis silvestris catus
3,animals,These are typical Australian animals except one.,3,Platypus,Dingo,Echidna,Sloth
4,animals,Representatives of three of these species prod...,3,Lizards,Scorpions,Frogs,Mosquitos


In [8]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457 entries, 0 to 456
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   genre     457 non-null    object
 1   question  457 non-null    object
 2   answer    457 non-null    object
 3   a         457 non-null    object
 4   b         457 non-null    object
 5   c         457 non-null    object
 6   d         457 non-null    object
dtypes: object(7)
memory usage: 25.1+ KB


In [9]:
# combined.to_csv("combined_quiz.csv", index=False)
# print("Final File Saved")


In [10]:
# save a copy directly to backend :)
combined.to_csv("../backend/app/data/combined_quiz.csv", index=False)
print("File Saved To backend data")

File Saved To backend data
