# Data Extraction
Use the pypdf library to extract the text from the exam, which is a PDF file.

## Importing libraries

In [None]:
import pandas as pd
import numpy as np
from pypdf import PdfReader
import sqlite3

## Selecting year and type of exam

Computing the expected total number of rows based on the number of questions and options per year, using the following criteria:
* From 2024 to 2021 = 210 questions and 4 options
* From 2020 to 2019 = 185 questions and 4 options
* From 2018 to 2015 = 235 questions and 4 options
* From 2014 to 2012 = 235 questions and 5 options
* From 2011 to 2004 = 260 questions and 5 options

In [None]:
def compute_expected_rows(year: int, topic: str) -> tuple:
    """Return the sizing and naming information for one exam.

    Args:
        year: Exam year.
        topic: Exam subject, one of "BIOLOGÍA", "FARMACIA", "QUÍMICA", "MEDICINA".

    Returns:
        A 4-tuple ``(max_rows, csv_name, num_questions, exam_code)`` where
        ``max_rows`` is questions * options + questions, ``csv_name`` is
        ``clean_<code>_<year>.csv``, ``num_questions`` is the second figure of
        the matching exam format (the options-per-question count) and
        ``exam_code`` is the short subject code.

    Raises:
        KeyError: If ``topic`` is not one of the known subjects.
    """
    # (last year of the period, questions, options per question), oldest first
    exam_formats = (
        (2011, 260, 5),
        (2014, 235, 5),
        (2018, 235, 4),
        (2020, 185, 4),
        (2021, 210, 4),
    )
    topic_codes = {
        "BIOLOGÍA": "bir",
        "FARMACIA": "fir",
        "QUÍMICA": "qir",
        "MEDICINA": "mir",
    }

    # The first period whose closing year is not before `year` applies.
    matched_format = next(
        ((questions, options) for last_year, questions, options in exam_formats if year <= last_year),
        None,
    )
    if matched_format is None:
        max_rows, num_questions = 0, 0
    else:
        questions, options = matched_format
        max_rows, num_questions = questions * options + questions, options

    # Every exam from 2021 onwards shares the most recent format.
    if year >= 2021:
        max_rows, num_questions = 1050, 4

    exam_code = topic_codes[topic]
    return max_rows, f"clean_{exam_code}_{year}.csv", num_questions, exam_code

In [None]:
# TODO: replace the `...` placeholder with the exam year (an int) before running.
year: int = ...
topic: str = "BIOLOGÍA" # BIOLOGÍA, FARMACIA, QUÍMICA, MEDICINA

# Fail fast with an actionable message instead of the obscure comparison
# TypeError the placeholder would otherwise trigger inside the helper.
if not isinstance(year, int):
    raise TypeError("Set `year` to the exam year (an int) before running this notebook")

total_num_rows, save_format, num_questions, sql_exam_name = compute_expected_rows(year, topic)

## Extracting Questions from Pdf file

In [None]:
path: str = f"../data/raw/type_1/Raw_Cuaderno_{year}_{topic}_0_C.pdf"

# Collect every text line of the exam, skipping the first two pages.
with PdfReader(path) as pdf_file:
    page_count = len(pdf_file.pages)
    full_text = [
        line
        for page_index in range(2, page_count)
        for line in pdf_file.pages[page_index].extract_text().splitlines()
    ]

Creating a Data Frame of the text

In [None]:
# One row per extracted text line, in a single "text" column.
exam_df = pd.DataFrame({"text": full_text})
exam_df.head(n=10)

## Extracting Answers from .tsv file into a DataFrame

In [None]:
# The answer key is a tab-separated file stored next to the exam PDF.
answers_path = f"../data/raw/type_1/Raw_Cuaderno_{year}_{topic}_0_C_Respuestas.tsv"
answers_df = pd.read_csv(answers_path, sep="\t")
answers_df.head(5)

# Data Transformation

## Exam pdf

Removing rows containing "Página" and rows that are empty after stripping whitespace

In [None]:
# Drop page-footer lines, trim whitespace, discard empty lines and renumber.
# From here on `exam_df` is a Series holding the "text" column.
page_marker_mask = exam_df["text"].str.contains("Página")
exam_df = (
    exam_df.loc[~page_marker_mask, "text"]
    .str.strip()
    .replace("", np.nan)
    .dropna()
    .reset_index(drop=True)
)
exam_df.head(5)

Function to join the lines:
* Checks if a line ends with "-", meaning that the word is truncated; if so, it removes the last character, appends the line below and deletes the appended line
* Checks if the first character of the following line can be converted into an integer, meaning that it starts a new Question or option. If it cannot, the following line is a continuation, so it is appended to the current line (separated by a space) and deleted

In [None]:
def process_multi_line_str(df):
    """Join text lines that were split across several rows.

    Two passes are applied, in this order:

    1. A line ending in "-" is a truncated word: the hyphen is removed and the
       following line is glued to it (repeated while the result still ends
       in "-").
    2. A line whose first character cannot be converted to an integer is a
       continuation: it is appended, separated by a space, to the previous
       kept line. The very first line is always kept.

    The input is not modified; the work is done on a plain list so the cost is
    linear in the number of lines.

    Args:
        df: pandas Series of strings, one text line per row.

    Returns:
        A new pandas Series with the joined lines, a fresh 0..n-1 index and the
        same name as the input.
    """

    def _starts_with_int(line) -> bool:
        # Same check as int(line[0:1]); an empty line counts as a continuation.
        try:
            int(line[0:1])
        except ValueError:
            return False
        return True

    # Pass 1: undo end-of-line hyphenation.
    dehyphenated = []
    for line in df.tolist():
        if dehyphenated and dehyphenated[-1].endswith("-"):
            dehyphenated[-1] = dehyphenated[-1][:-1] + line
        else:
            dehyphenated.append(line)

    # Pass 2: fold continuation lines into the line that precedes them.
    joined = []
    for line in dehyphenated:
        if joined and not _starts_with_int(line):
            joined[-1] = joined[-1] + " " + line
        else:
            joined.append(line)

    return pd.Series(joined, name=df.name, dtype=df.dtype)

In [None]:
# Rebuild full questions/options from the physical PDF lines.
exam_df_concat = process_multi_line_str(df=exam_df)
exam_df_concat.head(5)

Print the rows that do not end in "." or ":" to fix them

In [None]:
# Rows that do not end a sentence (".") or introduce options (":") are
# candidates for having been split from the row below.
properly_terminated = exam_df_concat.str.endswith((".", ":"))
unterminated_rows = exam_df_concat[~properly_terminated]

num_rows_incorrect_expected = len(exam_df_concat) - total_num_rows
num_rows_incorrect = unterminated_rows.count()
id_rows_incorrect = unterminated_rows.index.to_list()

print(f"Number of expected incorrect rows = {num_rows_incorrect_expected}")
print(f"Number of rows not ending with '.' of ':' = {num_rows_incorrect}")
print(id_rows_incorrect)

In [None]:
def print_rows_incorrrect(wrong_id: list, df):
    """Print each suspicious row together with its neighbours.

    For every position in ``wrong_id`` the two rows before and the two rows
    after are printed as context. The window start is clamped at 0: a negative
    start would make ``iloc`` count from the end of the Series and print an
    empty or unrelated slice for positions 0 and 1.

    Args:
        wrong_id: Positions of the rows to inspect.
        df: pandas Series (or DataFrame) holding the rows.
    """
    for wid in wrong_id:
        print(f"Id to fix {wid}:")
        window_start = max(wid - 2, 0)
        print(df.iloc[window_start: wid + 3])

In [None]:
def fix_incorrect(num_row, df, sep: str = " "):
    """Merge each listed row with the row directly below it.

    Positions are processed from the bottom up so that earlier positions stay
    valid while rows are removed. The input Series is not modified.

    Args:
        num_row: Positions of the rows that must absorb the row below them.
        df: pandas Series of strings.
        sep: Text inserted between the two joined rows. Defaults to a single
            space, matching how continuation lines are joined elsewhere in
            this notebook; pass "" to glue the rows together directly.

    Returns:
        A new pandas Series with the merged rows, a fresh 0..n-1 index and the
        same name as the input.

    Raises:
        IndexError: If a position is negative or has no row below it.
    """
    lines = df.tolist()
    for n in sorted(num_row, reverse=True):
        if n < 0 or n + 1 >= len(lines):
            raise IndexError(f"Row {n} has no following row to merge with")
        lines[n] = lines[n] + sep + lines[n + 1]
        del lines[n + 1]
    return pd.Series(lines, name=df.name, dtype=df.dtype)

If the number of incorrect rows is the same as expected, call fix_incorrect and display the number of rows to check that it worked.
If there are more or fewer incorrect rows than expected, print the surrounding rows (2 before and 2 after each one) and add to rows_fix the ids of the rows to fix.

In [None]:
# Automatic path: the rows flagged as incomplete account exactly for the
# surplus rows, so each of them can be merged with the row below it.
if num_rows_incorrect_expected == num_rows_incorrect:
    exam_df_fixed = fix_incorrect(id_rows_incorrect, exam_df_concat)
else:
    # Manual path: show the context of every flagged row and stop so the ids
    # can be reviewed by hand.
    print_rows_incorrrect(id_rows_incorrect, exam_df_concat)
    raise Warning("More incorrect rows that expected, uncomment the lines below and add ids to the list, and comment this line")
    # rows_fix = []
    # exam_df_fixed = fix_incorrect(rows_fix, exam_df_concat)

# The fix worked when the final row count equals the expected total number of
# rows (not the number of rows that had to be merged).
df_correct_rows = len(exam_df_fixed) == total_num_rows
print(f"Correct number of rows = {total_num_rows}")
print(exam_df_fixed.shape[0])
print(f"Row count matches expected = {df_correct_rows}")


In [None]:
# Each question occupies one row for its statement plus one row per option.
# `num_questions` holds the options-per-question count returned by
# compute_expected_rows.
rows_per_question = num_questions + 1

# Work on a separate frame so this cell can be re-run without failing
# (`exam_df_fixed` stays a Series).
exam_long_df = exam_df_fixed.to_frame(name="text")

# Consecutive blocks of `rows_per_question` rows belong to the same question.
# The parentheses matter: `index // num_questions+1` would divide by
# `num_questions` only and then add 1.
exam_long_df["group"] = np.arange(len(exam_long_df)) // rows_per_question + 1
exam_long_df["option_num"] = exam_long_df.groupby("group").cumcount() + 1

exam_df_pivot = exam_long_df.pivot(index="group", columns="option_num", values="text")
exam_df_pivot = exam_df_pivot.reset_index()

# Position 1 inside a group is the statement, the rest are the options.
key_list: list = list(range(1, rows_per_question + 1))
val_list: list = ["Question"] + [f"Option_{x}" for x in range(1, num_questions + 1)]

exam_df_pivot = exam_df_pivot.rename_axis(None, axis=1).rename(columns=dict(zip(key_list, val_list)))
exam_df_pivot = exam_df_pivot.drop(columns=["group"])

exam_df_pivot.head()

## Answers tsv

In [None]:
# The answer sheet repeats the (question number, correct answer) pair of
# columns side by side; stack all pairs into a single two-column frame.
original_col = ["V0", "RC"]

duplicate_cols = [f"{base}.{suffix}" for suffix in range(1, 5) for base in ("V0", "RC")]

answers_df_list = [answers_df[original_col]] + [
    answers_df[[number_col, answer_col]].rename(columns={number_col: "V0", answer_col: "RC"})
    for number_col, answer_col in zip(duplicate_cols[::2], duplicate_cols[1::2])
]

answers_df_clean = pd.concat(answers_df_list, ignore_index=True)

print(answers_df_clean)

## Joining the DataFrames

In [None]:
# Rows are matched by position: row i of the questions with row i of the answers.
clean_df = pd.concat(objs=[exam_df_pivot, answers_df_clean], axis="columns").drop(
    labels="V0", axis="columns"
)
clean_df.head(5)

Printing the rows that have a null value in any column (for example, a missing RC answer)

In [None]:
# Rows with at least one missing value in any column.
null_row_mask = clean_df.isna().any(axis="columns")
print(clean_df.loc[null_row_mask])

Filling null values with 0 and printing the sum of null values to check

In [None]:
# Missing values become 0 in every column; the sum below should be all zeros.
clean_df_2 = clean_df.fillna(value=0)
clean_df_2.isna().sum()

Changing Datatype of RC from float to int

In [None]:
# Only the RC column is converted to int; the other columns keep their dtype.
clean_df_2 = clean_df_2.astype({"RC": int})
clean_df_2.dtypes

In [None]:
# Preview of the final table before loading it.
clean_df_2.head(n=5)

# Data Loading

## Saving in csv file

In [None]:
clean_df_2["year"] = year

# Use the file name computed for the selected exam (clean_<code>_<year>.csv)
# instead of a hard-coded one, so other years/subjects do not overwrite or
# mislabel the output.
# NOTE(review): the "data/" directory is kept from the original cell; every
# other path in this notebook lives under "../data/" - confirm the target folder.
clean_df_2.to_csv(f"data/{save_format}", index=False)

## Writing into SQL db

In [None]:
db_path: str = "../data/clean/bir_warehouse.db"

query_question: str = """
INSERT INTO questions (exam_year, exam_subject, question) 
VALUES((SELECT id_year FROM year WHERE year_name = ?),
    (SELECT id_type FROM exam WHERE exam_type = ?),
    ?);
"""

# NOTE(review): the question id is looked up by its text; if the same
# statement can appear in more than one exam this may link options to the
# wrong question - confirm against the table schema.
query_options: str = """
INSERT INTO questions_options (question_id, option_num, option_text, is_correct)
VALUES((SELECT id FROM questions WHERE question = ?),
    ?,
    ?,
    ?)
"""

# Option columns are read from the frame so exams with a different number of
# options per question are loaded completely.
option_columns: list = [col for col in clean_df_2.columns if str(col).startswith("Option_")]

bir_warehouse = sqlite3.connect(db_path)
try:
    # The connection's context manager commits once if the whole load
    # succeeds and rolls everything back if any insert fails, so a failed run
    # never leaves a half-loaded exam behind.
    with bir_warehouse:
        cur = bir_warehouse.cursor()
        for question in clean_df_2.itertuples(index=False):
            cur.execute(query_question, (str(question.year), sql_exam_name, question.Question))
            cur.executemany(
                query_options,
                [
                    (
                        question.Question,
                        option_num,
                        getattr(question, option_col),
                        # Option numbers start at 1 and RC holds the correct one.
                        bool(option_num == question.RC),
                    )
                    for option_num, option_col in enumerate(option_columns, start=1)
                ],
            )
finally:
    # The context manager above does not close the connection.
    bir_warehouse.close()