# Data Extraction
Using the `pypdf` library, extract the text from the exam, which is provided as a PDF file.

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
from pypdf import PdfReader

Computing the expected total number of rows based on the number of questions and options

In [None]:
def compute_expected_rows(year: int, topic: str) -> tuple:
    """Return the expected row count and the output CSV name for one exam.

    Each question contributes one row for its statement plus one row per
    answer option, i.e. ``questions * options + questions`` rows in total.

    Exam layouts (BIR, FIR, QIR, MIR):
        2011 and earlier: 260 questions * 5 options
        2012 - 2014:      235 questions * 5 options
        2015 - 2018:      235 questions * 4 options
        2019 - 2020:      185 questions * 4 options
        2021 and later:   210 questions * 4 options

    Args:
        year: Exam year.
        topic: One of "BIOLOGÍA", "FARMACIA", "QUÍMICA", "MEDICINA".

    Returns:
        A ``(expected_rows, csv_file_name)`` tuple.

    Raises:
        KeyError: If ``topic`` is not one of the known topics.
    """
    # (last year the layout applies to, number of questions, options per question)
    layouts = (
        (2011, 260, 5),
        (2014, 235, 5),
        (2018, 235, 4),
        (2020, 185, 4),
        (2021, 210, 4),
    )
    file_prefix = {
        "BIOLOGÍA": "bir",
        "FARMACIA": "fir",
        "QUÍMICA": "qir",
        "MEDICINA": "mir",
    }
    # First layout whose closing year has not been passed yet; 0 when none matches.
    from_table = next(
        (
            questions * options + questions
            for last_year, questions, options in layouts
            if year <= last_year
        ),
        0,
    )
    # Years from 2021 on always use the 210 * 4 + 210 = 1050 layout.
    expected_rows = 1050 if year >= 2021 else from_table
    return expected_rows, f"clean_{file_prefix[topic]}_{year}.csv"

In [None]:
# Exam to process. Replace the `...` placeholder with the exam year (an int)
# before running: compute_expected_rows compares `year` against integer years.
year: int = ...
topic: str = "BIOLOGÍA" # BIOLOGÍA, FARMACIA, QUÍMICA, MEDICINA
# Expected number of rows after cleaning, and the name of the output CSV file.
total_num_rows, save_format = compute_expected_rows(year, topic)

## Extracting Questions from PDF file

In [None]:
# Raw exam booklet for the selected year and topic.
path: str = f"data/raw/Raw_Cuaderno_{year}_{topic}_0_C.pdf"
with PdfReader(path) as pdf_file:
    # Every text line of the exam, in reading order.
    full_text = []
    # Extraction starts at the third page (index 2).
    # NOTE(review): presumably the first two pages are cover/instructions - confirm.
    for n in range(2, len(pdf_file.pages)):
        page = pdf_file.pages[n]
        text: str = page.extract_text()
        # Fixed typo: the list method is `extend` (`exten` raised AttributeError).
        full_text.extend(text.splitlines())

Creating a DataFrame from the extracted text

In [None]:
# One row per extracted text line, kept in reading order under a single "text" column.
exam_df = pd.DataFrame({"text": full_text})
exam_df.head(10)

## Extracting Answers from .tsv file into a DataFrame

In [None]:
# Tab-separated answer sheet that accompanies the exam booklet.
answers_path = f"data/raw/Raw_Cuaderno_{year}_{topic}_0_C_Respuestas.tsv"
answers_df = pd.read_csv(answers_path, sep="\t")
answers_df.head()

# Data Transformation

## Exam pdf

Removing rows containing "Página" and rows that are empty or contain only whitespace

In [None]:
# Rows whose text contains the page marker "Página" are discarded.
is_page_marker = exam_df["text"].str.contains("Página")
# From here on `exam_df` is a Series of stripped, non-empty lines (no longer a
# DataFrame), so this cell cannot be re-run without re-creating `exam_df` first.
exam_df = (
    exam_df.loc[~is_page_marker, "text"]
    .str.strip()
    .replace("", np.nan)
    .dropna()
    .reset_index(drop=True)
)
exam_df.head()

Function to join the lines
* Checks whether a line ends with "-", meaning that the word is truncated; if so, it removes the last character, appends the line below, and deletes the appended line
* Checks whether the first character of the following line can be converted into an integer, meaning that it starts a new question or option; if it cannot, the following line is a continuation, so it is appended to the current line and then deleted

In [None]:
def process_multi_line_str(df):
    """Merge text lines that were split across rows into single logical rows.

    Two passes are applied, in this order:

    1. De-hyphenation: while a row ends with "-", the trailing "-" is removed
       and the following row is appended directly (no separator).
    2. Continuation: a row whose first character cannot be parsed as an
       integer is appended, after a single space, to the row before it. The
       very first row is always kept as the start of a row.

    The input is left untouched; the previous implementation wrote into the
    caller's Series before its first ``drop`` and rebuilt the whole Series on
    every merge.

    Args:
        df: pandas Series of strings, one text line per row.

    Returns:
        A new pandas Series with a fresh 0..n-1 index and the same name as
        ``df``, holding the merged rows.
    """
    def _starts_with_int(line):
        # Same probe as before: only the first character is tested, and an
        # empty line counts as "not an integer".
        try:
            int(line[0:1])
        except ValueError:
            return False
        return True

    # Pass 1: glue words that were hyphenated at a line break.
    dehyphenated = []
    for line in df.tolist():
        if dehyphenated and dehyphenated[-1].endswith("-"):
            dehyphenated[-1] = dehyphenated[-1][:-1] + line
        else:
            dehyphenated.append(line)

    # Pass 2: attach continuation lines to the row they belong to.
    merged = []
    for line in dehyphenated:
        if merged and not _starts_with_int(line):
            merged[-1] = merged[-1] + " " + line
        else:
            merged.append(line)

    return pd.Series(merged, name=df.name, dtype=df.dtype)

In [None]:
# Join hyphen-split words and continuation lines into single rows.
exam_df_concat = process_multi_line_str(exam_df)
exam_df_concat.head()

Print the rows that do not end in "." or ":" to fix them

In [None]:
# Surplus rows: how many more rows there are than the exam layout predicts.
num_rows_incorrect_expected = len(exam_df_concat) - total_num_rows
# Rows that do not end with "." or ":" are treated as incomplete.
ends_properly = exam_df_concat.str.endswith((".", ":"))
incomplete_rows = exam_df_concat[~ends_properly]
num_rows_incorrect = incomplete_rows.count()
id_rows_incorrect = incomplete_rows.index.to_list()
# Fixed: this line used to print `num_rows_incorrect` a second time.
print(f"Number of expected incorrect rows = {num_rows_incorrect_expected}")
print(f"Number of rows not ending with '.' or ':' = {num_rows_incorrect}")
print(id_rows_incorrect)

In [None]:
def print_rows_incorrrect(wrong_id: list, df):
    """Print each flagged row together with up to two rows on either side.

    Args:
        wrong_id: Positional row ids to inspect.
        df: pandas Series (or DataFrame) holding the rows.
    """
    for wid in wrong_id:
        print(f"Id to fix {wid}:")
        # Clamp the window start at 0: for ids 0 and 1 a negative start is
        # interpreted by iloc as an offset from the end, which printed an
        # empty or unrelated window.
        print(df.iloc[max(wid - 2, 0): wid + 3])

In [None]:
def fix_incorrect(num_row, df, sep=""):
    """Merge each listed row with the row that follows it.

    Rows are processed from the highest id to the lowest so that removing a
    merged row never shifts the ids still waiting to be processed.

    The input Series is not modified. The previous implementation wrote the
    first merge into the caller's Series, which corrupted it for any later
    call on the same data.

    Args:
        num_row: Positional ids of the rows to merge with their successor.
        df: pandas Series of strings.
        sep: Text inserted between the two merged rows. Defaults to "" (the
            original behaviour); pass " " to join with a space.

    Returns:
        A new pandas Series with a fresh 0..n-1 index and the same name as
        ``df``.

    Raises:
        IndexError: If an id has no following row to merge with.
    """
    rows = df.tolist()
    for n in sorted(num_row, reverse=True):
        if not 0 <= n < len(rows) - 1:
            raise IndexError(
                f"row {n} has no following row to merge with (rows available: {len(rows)})"
            )
        rows[n] = rows[n] + sep + rows[n + 1]
        del rows[n + 1]
    return pd.Series(rows, name=df.name, dtype=df.dtype)

In [None]:
# When the number of incomplete rows equals the surplus, every incomplete row
# can be merged automatically with the row that follows it.
if num_rows_incorrect_expected == num_rows_incorrect:
    exam_df_fixed = fix_incorrect(id_rows_incorrect, exam_df_concat)
    # Fixed: the result must be compared with the expected total number of
    # rows, not with the surplus count.
    df_correct_rows = len(exam_df_fixed) == total_num_rows
else:
    # Otherwise show each candidate with its neighbours for manual review.
    print_rows_incorrrect(id_rows_incorrect, exam_df_concat)

# Row ids chosen by hand after reviewing the output above.
# NOTE(review): these ids look specific to one exam - update them for each year/topic.
rows_fix = [55, 559, 516, 612, 703, 810, 838, 840, ]

The expected number of rows for an exam with 210 questions and 4 options per question is 210 + (210 × 4) = 1050, which should match `total_num_rows`.
Prints `True` if the number of rows matches the expected value.

In [None]:
# Apply the manually reviewed merges.
exam_df_fixed = fix_incorrect(rows_fix, exam_df_concat)
# Fixed: compare with the expected total number of rows, not with the surplus count.
df_correct_rows = len(exam_df_fixed) == total_num_rows
print(df_correct_rows)
if not df_correct_rows:
    # Stop here so a wrongly sized frame is never pivoted below.
    raise Warning(
        f"Expected {total_num_rows} rows after fixing, got {len(exam_df_fixed)}"
    )

In [None]:
# Each question occupies one row for its statement plus one row per option.
# Exams up to 2014 have 5 options and later ones 4 (same table as in
# compute_expected_rows), so the block size is no longer hard-coded to 5.
n_options = 5 if year <= 2014 else 4
rows_per_question = n_options + 1

# Work on a separate frame so `exam_df_fixed` stays a Series and this cell can be re-run.
exam_df_long = exam_df_fixed.to_frame()
# 1-based question number for every row, assigned by row position.
exam_df_long["group"] = np.arange(len(exam_df_long)) // rows_per_question + 1
# Position inside the question block: 1 = statement, 2.. = options.
exam_df_long["option_num"] = exam_df_long.groupby("group").cumcount() + 1
exam_df_pivot = exam_df_long.pivot(index="group", columns="option_num", values="text")
exam_df_pivot = exam_df_pivot.reset_index()
exam_df_pivot = exam_df_pivot.rename_axis(None, axis=1).rename(columns={
    1: "Question",
    # Columns 2..n_options+1 become "Option 1".."Option n".
    **{k + 1: f"Option {k}" for k in range(1, n_options + 1)},
})
exam_df_pivot = exam_df_pivot.drop(columns=["group"])

exam_df_pivot.head()

## Answers tsv

In [None]:
# The answer sheet repeats the "V0"/"RC" column pair side by side; the repeats
# are named "V0.1"/"RC.1", "V0.2"/"RC.2", ...
# NOTE(review): presumably V0 is the question number and RC the correct answer - confirm.
original_col = ["V0", "RC"]

# Collect every repeated pair actually present instead of assuming exactly four.
column_pairs = [original_col]
pair_id = 1
while {f"V0.{pair_id}", f"RC.{pair_id}"} <= set(answers_df.columns):
    column_pairs.append([f"V0.{pair_id}", f"RC.{pair_id}"])
    pair_id += 1

# Rename each pair back to "V0"/"RC" and stack them vertically, left to right.
answers_df_list = [
    answers_df[pair].rename(columns=dict(zip(pair, original_col)))
    for pair in column_pairs
]
answers_df_clean = pd.concat(answers_df_list, ignore_index=True)

answers_df_clean.head()

## Joining the DataFrames

In [None]:
# Place questions and answers side by side (rows are matched by index),
# then discard the "V0" column.
clean_df = pd.concat([exam_df_pivot, answers_df_clean], axis=1).drop(columns=["V0"])
clean_df.head()

In [None]:
# Show every row that has at least one missing value.
has_missing = clean_df.isna().any(axis=1)
print(clean_df.loc[has_missing])

In [None]:
# Fill missing answers only: a blanket fillna(0) also wrote the integer 0 into
# the question/option text columns. Missing text now stays visible in the
# null counts displayed below.
clean_df_2 = clean_df.fillna({"RC": 0})
clean_df_2.isnull().sum()

In [None]:
# Store the "RC" column as integers.
clean_df_2 = clean_df_2.astype({"RC": int})
clean_df_2.dtypes

# Data Loading

## Saving in csv file

In [None]:
# Tag every row with the exam year.
clean_df_2["year"] = year
# Name kept for backward compatibility; it holds whichever exam was processed.
bir_2024 = clean_df_2

# Fixed: use the file name computed by compute_expected_rows so that each
# year/topic is written to its own file instead of always to clean_bir_2024.csv.
bir_2024.to_csv(f"data/{save_format}", index= False)

## Writing into SQL db