In [1]:
# Root data folder: its entries are listed as per-contest directories further down,
# and the combined master.csv is written back into this same folder.
data_dir = "../data"

In [2]:
import pandas as pd
import os

# Columns selected from each contest's problems.csv.
problems_vars = [
    "index",
    "rating",
]

# Columns read from each contest's users.csv.
users_vars = [
    "handle",
    "country",
    "city",
    "rating",
    "max_rating",
]

# Columns kept from the accepted rows of each contest's submissions.csv.
submissions_vars = [
    "contest_id",
    "problem_index",
    "author_handle",
    "programming_language",
    "relative_time_seconds",
]

def process_contest_data(path, start_date, i):
    """Summarise one contest folder as a wide frame with one row per solver.

    Reads ``contest.csv``, ``problems.csv`` and ``submissions.csv`` from
    ``path`` and keeps only submissions whose ``verdict`` is ``'OK'``.
    Problems are numbered 1..k following the sorted order of their ``index``
    values; for every problem number ``x`` the result contains:

    * ``finished_x``       - True when the author has an accepted submission
    * ``x_language``       - language of the author's earliest accepted submission
    * ``relative_time_x``  - ``relative_time_seconds`` of that submission
    * ``time_to_answer_x`` - seconds elapsed since the author's previous
      first-accept in this contest (equal to ``relative_time_x`` for the
      first problem the author solved)

    plus ``rating_achieved`` (sum of the ratings of the solved problems),
    ``author_handle``, ``contest_id``, ``contest_name`` and
    ``contest_start_time``.

    Only the earliest accepted submission per (author, problem) is used, so a
    re-submitted accepted solution neither adds the problem's rating twice nor
    distorts the language / time columns. Problems without a rating contribute
    0 to ``rating_achieved`` instead of invalidating the whole sum.

    Parameters
    ----------
    path : str
        Folder containing the three CSV files of one contest.
    start_date : int
        Contests whose ``start_time_seconds`` is smaller than this are skipped.
    i : int
        Zero-based position of the contest in the work list; only used for
        the progress output.

    Returns
    -------
    pandas.DataFrame
        One row per author with at least one accepted submission, or an empty
        frame when the contest started before ``start_date``.
    """
    # Progress marker: one dot per contest, plus a running count every 50 contests.
    print(".", end = "")
    if (i + 1) % 50 == 0:
        print("  ", i + 1)

    contest = pd.read_csv(os.path.join(path, "contest.csv"))
    if contest["start_time_seconds"].values[0] < start_date:
        return pd.DataFrame()

    contest_id, contest_name, contest_start_time = contest[["id", "name", "start_time_seconds"]].values[0]

    # .copy() so the column assignments below act on independent frames
    # rather than on slices of the frames returned by read_csv.
    problems = pd.read_csv(os.path.join(path, "problems.csv"))[problems_vars].copy()
    ok_submissions = (
        pd.read_csv(os.path.join(path, "submissions.csv"))
        .query("verdict == 'OK'")[submissions_vars]
        .copy()
    )

    # Problem indexes may be alphanumeric; compare them as strings on BOTH sides
    # so that purely numeric indexes still match between the two files.
    problems["index"] = problems["index"].astype(str)
    ok_submissions["problem_index"] = ok_submissions["problem_index"].astype(str)

    # Sorted problem indexes -> 1-based problem numbers, and index -> rating.
    problem_positions = sorted(problems["index"].unique())
    position_mapping = {pos: number for number, pos in enumerate(problem_positions, start=1)}
    rating_mapping = problems.set_index("index")["rating"].to_dict()

    # Result skeleton: authors in order of first appearance, one column group per problem.
    authors = ok_submissions["author_handle"].unique()
    numbers = range(1, len(problem_positions) + 1)
    finished_columns = [f"finished_{n}" for n in numbers]
    language_columns = [f"{n}_language" for n in numbers]
    time_columns = [f"relative_time_{n}" for n in numbers]
    time_to_answer_columns = [f"time_to_answer_{n}" for n in numbers]
    all_columns = finished_columns + language_columns + time_columns + time_to_answer_columns + ["rating_achieved"]

    finished_df = pd.DataFrame(index=authors, columns=all_columns)
    finished_df[finished_columns] = False  # nothing solved until proven otherwise
    finished_df["rating_achieved"] = 0

    # Keep only the earliest accepted submission for every (author, problem).
    # After the sort, "first" is the smallest relative time.
    first_accepts = (
        ok_submissions
        .sort_values(by=["author_handle", "relative_time_seconds"])
        .drop_duplicates(subset=["author_handle", "problem_index"], keep="first")
    )

    # Time between consecutive first-accepts of the same author; an author's
    # first solve has no predecessor, so it falls back to its own relative time.
    elapsed = first_accepts.groupby("author_handle")["relative_time_seconds"].diff()
    first_accepts = first_accepts.assign(
        time_to_answer=elapsed.fillna(first_accepts["relative_time_seconds"])
    )

    for author, problem_index, language, relative_time, time_to_answer in zip(
        first_accepts["author_handle"],
        first_accepts["problem_index"],
        first_accepts["programming_language"],
        first_accepts["relative_time_seconds"],
        first_accepts["time_to_answer"],
    ):
        number = position_mapping.get(problem_index)
        if number is None:  # accepted submission for a problem missing from problems.csv
            continue

        finished_df.at[author, f"finished_{number}"] = True
        finished_df.at[author, f"{number}_language"] = language
        finished_df.at[author, f"relative_time_{number}"] = relative_time
        finished_df.at[author, f"time_to_answer_{number}"] = time_to_answer

        # A missing rating must not turn the author's running total into NaN.
        rating = rating_mapping.get(problem_index, 0)
        if pd.notna(rating):
            finished_df.at[author, "rating_achieved"] += rating

    # Move the author handles from the index into a regular column and tag the contest.
    finished_df = finished_df.reset_index().rename(columns={"index": "author_handle"})
    finished_df["contest_id"] = contest_id
    finished_df["contest_name"] = contest_name
    finished_df["contest_start_time"] = contest_start_time

    return finished_df

In [None]:
from joblib import Parallel, delayed, parallel_config

start_date = 1609459200 # unix time for the start of the desired data (2021-01-01 00:00:00 UTC)

# Only sub-directories are contest folders: the export step writes master.csv into
# data_dir itself, so a plain os.listdir() would pick that file up on a re-run and
# fail when looking for "<file>/contest.csv". os.listdir() also returns entries in
# arbitrary order; sorting keeps the run (and which duplicate user row survives
# drop_duplicates below) deterministic.
data_paths = sorted(
    os.path.join(data_dir, contest_dir)
    for contest_dir in os.listdir(data_dir)
    if "codeforces-round-978-div-2" not in contest_dir
    and os.path.isdir(os.path.join(data_dir, contest_dir))
)
if not data_paths:
    # pd.concat would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No contest folders found in {data_dir!r}")

# Not used by the visible cells; kept so any later cell that references them still runs.
processed_dfs = []
user_dfs = []
print(f"Processing {len(data_paths)} folders. This might take a while")
print(" ---+--- 1 ---+--- 2 ---+--- 3 ---+--- 4 ---+--- 5")

with parallel_config(n_jobs = -1):
    # One wide per-contest frame per folder (empty for contests before start_date).
    contest_frames = Parallel()(
        delayed(process_contest_data)(path, start_date, i)
        for i, path in enumerate(data_paths)
    )
    # Per-contest user tables, restricted to the columns in users_vars.
    user_frames = Parallel()(
        delayed(pd.read_csv)(os.path.join(path, "users.csv"), usecols = users_vars)
        for path in data_paths
    )

master_df = pd.concat(contest_frames)
# The same handle appears in many contests; keep its first occurrence only.
user_df = pd.concat(user_frames).drop_duplicates(subset = "handle").reset_index(drop = True)

Processing 1675 folders. This might take a while
 ---+--- 1 ---+--- 2 ---+--- 3 ---+--- 4 ---+--- 5


In [None]:
# Attach each author's user attributes to the per-contest rows (inner join on the
# handle), drop the duplicated key column, and order the result chronologically.
final = (
    master_df
    .merge(user_df, left_on = "author_handle", right_on = "handle")
    .drop(columns = ["handle"])
    .sort_values(["contest_start_time", "author_handle"])
    .reset_index(drop = True)
)

In [22]:
# Persist the merged table without the positional index.
# NOTE: the file is written into data_dir, next to the contest folders.
final.to_csv(path_or_buf = os.path.join(data_dir, "master.csv"), index = False)