## Import libraries

In [1]:
import os
import json
import requests
import re

from bs4 import BeautifulSoup

In [2]:
# Running tally of parsed question/answer pairs; starts at zero in a fresh kernel.
TOTAL_QUESTIONS_PARSED = 0

# Locations of the raw downloads (input) and the parsed JSON files (output),
# both relative to the notebook's working directory.
RAW_PATH = "./questions_data/raw/"
PARSING_RESULT_PATH = "./questions_data/data/"

# Create the output directory up front; exist_ok keeps this cell safe to re-run.
os.makedirs(name=PARSING_RESULT_PATH, exist_ok=True)

## 1. Data-Science-Interview-Questions-Answers
https://github.com/youssefHosni/Data-Science-Interview-Questions-Answers

### Github

In [3]:
# main_url = "https://raw.githubusercontent.com/youssefHosni/Data-Science-Interview-Questions-Answers/refs/heads/main/README.md"

# parsed from here already to files
# github_questions_urls = [
#     "https://raw.githubusercontent.com/youssefHosni/Data-Science-Interview-Questions-Answers/refs/heads/main/Machine%20Learning%20Interview%20Questions%20%26%20Answers%20for%20Data%20Scientists.md",
#     "https://raw.githubusercontent.com/youssefHosni/Data-Science-Interview-Questions-Answers/refs/heads/main/Deep%20Learning%20Questions%20%26%20Answers%20for%20Data%20Scientists.md",
#     "https://raw.githubusercontent.com/youssefHosni/Data-Science-Interview-Questions-Answers/refs/heads/main/Statistics%20Interview%20Questions%20%26%20Answers%20for%20Data%20Scientists.md",
#     "https://raw.githubusercontent.com/youssefHosni/Data-Science-Interview-Questions-Answers/refs/heads/main/Probability%20Interview%20Questions%20%26%20Answers%20for%20Data%20Scientists.md",
#     "https://raw.githubusercontent.com/youssefHosni/Data-Science-Interview-Questions-Answers/refs/heads/main/Python%20Interview%20Questions%20%26%20Answers%20for%20Data%20Scientists.md",
#     "https://raw.githubusercontent.com/youssefHosni/Data-Science-Interview-Questions-Answers/refs/heads/main/SQL%20%26%20DB%20Interview%20Questions%20%26%20Answers%20for%20Data%20Scientists.md",
#     "https://raw.githubusercontent.com/youssefHosni/Data-Science-Interview-Questions-Answers/refs/heads/main/Resume%20Based%20Questions.md"
# ]

In [4]:
# Directory holding the markdown files saved from the GitHub repository.
github_mds_path = os.path.join(RAW_PATH, "data-science-interview-questions-mds/")

# os.listdir returns every directory entry in arbitrary order, so keep only the
# markdown files (skipping stray entries such as hidden files or folders) and
# sort them so the processing order is the same on every run.
github_mds_files = sorted(
    file_name for file_name in os.listdir(github_mds_path) if file_name.endswith(".md")
)

def parse_questions_answers(content: str) -> list[dict[str, str]]:
    """Parse markdown with ``### Q<n>: <question>`` headers into question/answer pairs.

    Every line that starts with ``### Q<number>: `` opens a new question. All
    following lines up to the next question header are collected as its answer;
    when a line starts with ``Answer:`` (case-insensitive) only the text after
    that keyword is kept.

    Args:
        content: Full text of one markdown file.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts in document order.
        The question has a closing ``###`` marker (with or without a space in
        front of it) removed; the answer is stripped of surrounding whitespace
        and is ``""`` when no answer lines follow the question. A header whose
        question text is empty after cleaning is not emitted.
    """
    question_pattern = re.compile(r"### Q\d+: (.+)")
    # Closing header marker such as "? ###" or "?###". Two or more hashes are
    # required so that a question legitimately ending in a single "#" is kept.
    closing_marker_pattern = re.compile(r"\s*#{2,}\s*$")
    answer_start_pattern = re.compile(r"Answer:\s*(.*)", re.IGNORECASE)

    qa_list = []
    current_question = None
    current_answer = []

    def clean_answer(answer_lines):
        """Join the collected lines and trim surrounding whitespace."""
        return "\n".join(answer_lines).strip()

    for line in content.splitlines():
        question_match = question_pattern.match(line)

        if question_match:
            # A new header closes the previous question, if there was one.
            if current_question:
                qa_list.append({
                    "question": current_question,
                    "answer": clean_answer(current_answer),
                })

            current_question = closing_marker_pattern.sub("", question_match.group(1)).strip()
            current_answer = []
            continue

        answer_match = answer_start_pattern.match(line)
        if answer_match:
            # Keep only the text that follows the "Answer:" keyword on this line.
            current_answer.append(answer_match.group(1))
        elif current_question:
            # Any other line after a question header belongs to its answer.
            current_answer.append(line)

    # The last question has no following header to close it.
    if current_question:
        qa_list.append({
            "question": current_question,
            "answer": clean_answer(current_answer),
        })
    return qa_list


In [5]:
for file_name in github_mds_files:
    print(f"Now in {file_name}")
    # Concept name = first three space-separated words of the file stem, lower-cased
    # and joined with underscores.
    concept_name = '_'.join(file_name.split('.')[0].split(' ')[:3]).lower()
    print(concept_name)

    # Decode explicitly as UTF-8 instead of the platform default so the cell behaves
    # the same on every OS (assumes the saved markdown is UTF-8 — TODO confirm).
    with open(os.path.join(github_mds_path, file_name), 'r', encoding="utf-8") as f:
        content = f.read()
    qa_list = parse_questions_answers(content)

    # Write into the configured output directory rather than a duplicated literal path.
    output_path = os.path.join(PARSING_RESULT_PATH, f"1_{concept_name}_parsed_questions.json")
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(qa_list, f)

    print(f"Successfully parsed {len(qa_list)} questions and answers.")
    # Note: re-running this cell adds the counts again.
    TOTAL_QUESTIONS_PARSED += len(qa_list)

    print()

print(f"Total questions parsed: {TOTAL_QUESTIONS_PARSED}")

Now in Machine Learning Interview Questions & Answers for Data Scientists.md
machine_learning_interview
Successfully parsed 36 questions and answers.

Now in Python Interview Questions & Answers for Data Scientists.md
python_interview_questions
Successfully parsed 14 questions and answers.

Now in Probability Interview Questions & Answers for Data Scientists.md
probability_interview_questions
Successfully parsed 17 questions and answers.

Now in Resume Based Questions.md
resume_based_questions
Successfully parsed 2 questions and answers.

Now in Deep Learning Questions & Answers for Data Scientists.md
deep_learning_questions
Successfully parsed 44 questions and answers.

Now in SQL & DB Interview Questions & Answers for Data Scientists.md
sql_&_db
Successfully parsed 12 questions and answers.

Now in Statistics Interview Questions & Answers for Data Scientists.md
statistics_interview_questions
Successfully parsed 17 questions and answers.

Total questions parsed: 142


### Medium

In [6]:
# downloaded pages already to files
# medium_question_urls = [
#     "https://levelup.gitconnected.com/top-large-language-models-llms-interview-questions-answers-d7b83f94c4e",
#     "https://levelup.gitconnected.com/top-computer-vision-interview-questions-answers-part-1-7eddf45cfdf7",
#     "https://levelup.gitconnected.com/top-computer-vision-interview-questions-answers-part-2-107244fc4289",
#     "https://levelup.gitconnected.com/top-computer-vision-interview-questions-answers-part-3-1e43909131b2"
# ]

In [8]:
# Folder holding the saved Medium article pages.
medium_html_path = os.path.join(RAW_PATH, "data-science-interview-questions-medium/")

# Keep only the entries whose name ends with 'html'; everything else is ignored.
medium_html_files = list(
    filter(lambda entry: entry.endswith('html'), os.listdir(medium_html_path))
)
medium_html_files

['Top Large Language Models (LLMs) Interview Questions & Answers _ by Youssef Hosni _ Level Up Coding.html',
 'Top Computer Vision Interview Questions & Answers [Part 2] _ by Youssef Hosni _ Level Up Coding.html',
 'Top Computer Vision Interview Questions & Answers [Part 3] _ by Youssef Hosni _ Level Up Coding.html',
 'Top Computer Vision Interview Questions & Answers [Part 1] _ by Youssef Hosni _ Level Up Coding.html']

In [54]:
def get_concept_name(file_name: str) -> str:
    """Build a short snake_case concept name from a saved article file name.

    The name is the first four space-separated words of ``file_name``,
    lower-cased and joined with underscores. If the file name contains a
    ``[...]`` group (e.g. ``[Part 2]``), its text is appended the same way.

    Args:
        file_name: File name of a saved HTML page.

    Returns:
        The concept name, e.g. ``top_computer_vision_interview_part_2``.
    """
    leading_words = file_name.split(' ')[:4]
    prefix = '_'.join(leading_words).lower()

    segments = file_name.split('[')
    if len(segments) < 2:
        # No bracketed part label in the name.
        return prefix

    # Text between the first '[' and the following ']'.
    label = segments[1].split(']')[0]
    suffix = label.replace(' ', '_').lower()
    return f"{prefix}_{suffix}"

# Quick check on two of the saved pages (positions 0 and 3 in the listing).
tuple(get_concept_name(medium_html_files[position]) for position in (0, 3))

('top_large_language_models', 'top_computer_vision_interview_part_1')

In [55]:
def parse_questions_answers_from_html(html_content: str):
    """Extract question/answer pairs from a saved HTML article.

    Every ``<h1>`` element is treated as a question. Its answer is the text of
    the sibling elements that follow it, up to (not including) the next ``<h1>``
    found in the document, or up to the last sibling when there is none.

    Args:
        html_content: Full HTML source of the page.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts in document order;
        the answer joins the formatted text of each collected element with newlines.
    """
    soup = BeautifulSoup(html_content, "lxml")

    # Assumes questions are rendered as h1 headings.
    question_tags = soup.find_all('h1')

    qa_list = []
    for index, question_tag in enumerate(question_tags):
        question_text = extract_text_with_formatting(question_tag)

        has_next = index + 1 < len(question_tags)
        next_question_tag = question_tags[index + 1] if has_next else None

        # Walk the following siblings until the next question heading is reached.
        # Identity (`is not`) is used on purpose: Beautiful Soup's `==`/`!=` compare
        # markup structurally, so an element that merely looks like the next
        # heading would otherwise end the answer early.
        answer_elements = []
        sibling = question_tag.find_next_sibling()
        while sibling is not None and sibling is not next_question_tag:
            answer_elements.append(sibling)
            sibling = sibling.find_next_sibling()

        answer_text = '\n'.join(extract_text_with_formatting(el) for el in answer_elements)

        qa_list.append({
            "question": question_text,
            "answer": answer_text
        })

    return qa_list

def extract_text_with_formatting(element):
    """Return the text of an HTML element, marking bold and italic as Markdown.

    Direct children named ``strong`` become ``**text**`` and children named
    ``em`` become ``*text*``; string children are copied as they are; any other
    child is processed recursively. ``None`` yields an empty string.
    """
    if element is None:
        return ''

    # Tag name -> Markdown marker placed on both sides of the tag's text.
    markdown_markers = {'strong': '**', 'em': '*'}

    pieces = []
    for child in element.contents:
        marker = markdown_markers.get(child.name)
        if marker is not None:
            pieces.append(f"{marker}{child.get_text()}{marker}")
        elif isinstance(child, str):
            pieces.append(child)
        else:
            # Nested element: descend and format its children the same way.
            pieces.append(extract_text_with_formatting(child))

    return "".join(pieces)

In [56]:
for medium_html_file in medium_html_files:
    print(f"Now in {medium_html_file}")
    with open(os.path.join(medium_html_path, medium_html_file), "r", encoding="utf-8") as html_f:
        html_content = html_f.read()

    # Short snake_case name used in the output file name.
    concept_name = get_concept_name(medium_html_file)
    print(concept_name)

    qa_list = parse_questions_answers_from_html(html_content)

    # Write into the configured output directory rather than a duplicated literal path.
    output_path = os.path.join(PARSING_RESULT_PATH, f"1_{concept_name}_medium_parsed_questions.json")
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(qa_list, f)

    print(f"Successfully parsed {len(qa_list)} questions and answers.")
    # Note: re-running this cell adds the counts again.
    TOTAL_QUESTIONS_PARSED += len(qa_list)

    print()

print(f"Total questions parsed: {TOTAL_QUESTIONS_PARSED}")

Now in Top Large Language Models (LLMs) Interview Questions & Answers _ by Youssef Hosni _ Level Up Coding.html
top_large_language_models
Successfully parsed 24 questions and answers.

Now in Top Computer Vision Interview Questions & Answers [Part 2] _ by Youssef Hosni _ Level Up Coding.html
top_computer_vision_interview_part_2
Successfully parsed 29 questions and answers.

Now in Top Computer Vision Interview Questions & Answers [Part 3] _ by Youssef Hosni _ Level Up Coding.html
top_computer_vision_interview_part_3
Successfully parsed 23 questions and answers.

Now in Top Computer Vision Interview Questions & Answers [Part 1] _ by Youssef Hosni _ Level Up Coding.html
top_computer_vision_interview_part_1
Successfully parsed 4 questions and answers.

Total questions parsed: 222


## 2. Data-Science-Interviews

https://github.com/alexeygrigorev/data-science-interviews/tree/master

In [6]:
# Directory holding the markdown files saved from the GitHub repository.
github_mds_path = os.path.join(RAW_PATH, "data-science-interviews-mds/")

# os.listdir returns every directory entry in arbitrary order, so keep only the
# markdown files (skipping stray entries such as hidden files or folders) and
# sort them so the processing order is the same on every run.
github_mds_files = sorted(
    file_name for file_name in os.listdir(github_mds_path) if file_name.endswith(".md")
)

github_mds_files

['theory.md']

P.S. I decided to drop technical.md because it mostly covers SQL, algorithms and plain Python; for now we will do without such practical questions and concentrate on ML.

In [7]:
def parse_questions_answers(content: str) -> list[dict[str, str]]:
    """Split markdown text into question/answer pairs keyed on bold lines.

    A line that begins with ``**...**`` starts a new question (the text between
    the first pair of double asterisks, stripped). Lines that start with ``##``
    followed by whitespace and text are section headers and are dropped. Every
    other line is collected as part of the answer to the most recent question.

    Args:
        content: Full text of one markdown file.

    Returns:
        ``{"question": ..., "answer": ...}`` dicts in document order; each answer
        is the collected lines joined with newlines and stripped.
    """
    bold_line = re.compile(r"\*\*(.+?)\*\*")
    section_header = re.compile(r"^##\s+.+")

    parsed_pairs = []

    def store(question, answer_lines):
        # Nothing is stored until a non-empty question has been seen.
        if question:
            parsed_pairs.append({
                "question": question,
                "answer": "\n".join(answer_lines).strip(),
            })

    active_question = None
    collected = []
    for text_line in content.splitlines():
        bold_match = bold_line.match(text_line)
        if bold_match is not None:
            # A new question closes the one collected so far.
            store(active_question, collected)
            active_question = bold_match.group(1).strip()
            collected = []
        elif section_header.match(text_line) is None:
            collected.append(text_line)

    # Flush the final question, which has no successor to close it.
    store(active_question, collected)
    return parsed_pairs


In [8]:
for file_name in github_mds_files:
    print(f"Now in {file_name}")
    # Concept name = file name up to the first dot.
    concept_name = file_name.split('.')[0]
    print(concept_name)

    # Decode explicitly as UTF-8 instead of the platform default so the cell behaves
    # the same on every OS (assumes the saved markdown is UTF-8 — TODO confirm).
    with open(os.path.join(github_mds_path, file_name), 'r', encoding="utf-8") as f:
        content = f.read()
    qa_list = parse_questions_answers(content)

    # Write into the configured output directory rather than a duplicated literal path.
    output_path = os.path.join(PARSING_RESULT_PATH, f"2_{concept_name}_parsed_questions.json")
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(qa_list, f)

    print(f"Successfully parsed {len(qa_list)} questions and answers.")
    # Note: re-running this cell adds the counts again.
    TOTAL_QUESTIONS_PARSED += len(qa_list)

    print()

print(f"Total questions parsed: {TOTAL_QUESTIONS_PARSED}")

Now in theory.md
theory
Successfully parsed 166 questions and answers.

Total questions parsed: 308


## 3. Data-Science-Interview-Questions-And-Answers

https://github.com/iamtodor/data-science-interview-questions-and-answers/tree/master

In [15]:
# Directory holding the markdown files saved from the GitHub repository.
github_mds_path = os.path.join(RAW_PATH, "data-science-interview-questions-and-answers-mds/")

# os.listdir returns every directory entry in arbitrary order, so keep only the
# markdown files (skipping stray entries such as hidden files or folders) and
# sort them so that index 0 refers to the same file on every run.
github_mds_files = sorted(
    file_name for file_name in os.listdir(github_mds_path) if file_name.endswith(".md")
)

github_mds_files

['main.md']

In [18]:
import re

def parse_questions_answers(content: str) -> list[dict[str, str]]:
    """Parse markdown with numbered ``## <n>. <question>`` headers into Q/A pairs.

    A line (ignoring surrounding whitespace) of the form ``## 1. Question text``
    starts a new question. Lines that are sub-headers of level four or deeper
    (``#### Title``) are kept inside the answer, normalised to
    ``<hashes> <title>`` and surrounded by blank lines. Every other line is
    appended to the current answer unchanged.

    Args:
        content: Full text of one markdown file.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts in document order;
        each answer is the collected lines joined with newlines and stripped.
    """
    # Question headers, e.g. "## 1. Why do you use feature selection?"
    question_pattern = re.compile(r"^##\s*\d+\.\s*(.+)")

    # Sub-section headers inside an answer, e.g. "#### Filter Methods". The hashes
    # and the title are captured separately so the header can be re-emitted with a
    # single space between them, which markdown needs to render it as a header.
    subsection_pattern = re.compile(r"^(#{4,})\s*([^#\s].*)")

    qa_list = []
    current_question = None
    current_answer = []

    def clean_answer(answer_lines):
        """Join the collected lines and trim surrounding whitespace."""
        return "\n".join(answer_lines).strip()

    for line in content.splitlines():
        stripped_line = line.strip()
        question_match = question_pattern.match(stripped_line)

        if question_match:
            # A new header closes the previous question, if there was one.
            if current_question:
                qa_list.append({
                    "question": current_question,
                    "answer": clean_answer(current_answer),
                })

            current_question = question_match.group(1).strip()
            current_answer = []
            continue

        subsection_match = subsection_pattern.match(stripped_line)
        if subsection_match:
            hashes, title = subsection_match.groups()
            current_answer.append(f"\n\n{hashes} {title}\n\n")
        else:
            # Plain answer text is kept exactly as written (indentation included).
            current_answer.append(line)

    # The last question has no following header to close it.
    if current_question:
        qa_list.append({
            "question": current_question,
            "answer": clean_answer(current_answer),
        })

    return qa_list

In [19]:
# Keep adding to the running total from the earlier cells instead of resetting it to
# a hard-coded number, which goes stale as soon as any earlier count changes.
# Note: re-running this cell adds the count again.

file_name = github_mds_files[0]
print(f"Now in {file_name}")
# Concept name = file name up to the first dot.
concept_name = file_name.split('.')[0]
print(concept_name)

# Decode explicitly as UTF-8 instead of the platform default so the cell behaves
# the same on every OS (assumes the saved markdown is UTF-8 — TODO confirm).
with open(os.path.join(github_mds_path, file_name), 'r', encoding="utf-8") as f:
    content = f.read()
qa_list = parse_questions_answers(content)

# Write into the configured output directory rather than a duplicated literal path.
output_path = os.path.join(PARSING_RESULT_PATH, f"3_{concept_name}_parsed_questions.json")
with open(output_path, 'w', encoding="utf-8") as f:
    json.dump(qa_list, f)

print(f"Successfully parsed {len(qa_list)} questions and answers.")
TOTAL_QUESTIONS_PARSED += len(qa_list)

print()

print(f"Total questions parsed: {TOTAL_QUESTIONS_PARSED}")

Now in main.md
main
Successfully parsed 33 questions and answers.

Total questions parsed: 341
