In [1]:
# TODO: maybe precalculate the augmentations and include them with the same label

In [2]:
import os
import pandas as pd
import random
# functions for reading data from codenet:
import codenet_reader

In [3]:
# Functions for processing solutions of a problem


def label(pid: str, row) -> str:
    """Build the class label for one submission.

    The label has the form ``'<pid>-<flag>'`` where the flag is ``1`` when
    ``row['status']`` equals ``'Accepted'`` and ``0`` for any other status.

    Args:
        pid: Problem identifier used as the label prefix.
        row: Any mapping-like object exposing a ``'status'`` key
            (a metadata DataFrame row in this notebook).

    Returns:
        The combined label string, e.g. ``'p03200-1'``.
    """
    accepted_flag = int(row['status'] == 'Accepted')
    return f'{pid}-{accepted_flag}'


def process_pid(pid: str, srcs: list[str], lbls: list[str]):
    """Collect labelled source snippets for one problem and append them to the accumulators.

    Problems are skipped (nothing is appended, ``None`` is returned) when they
    have fewer than ``MIN_SOLUTIONS`` entries from
    ``codenet_reader.python_solutions``, when no metadata frame is available,
    or when fewer than ``MIN_SOLUTIONS`` rows remain after keeping only the
    'Accepted' and 'Wrong Answer' statuses. Problems with more than
    ``MAX_SOLUTIONS`` remaining rows are randomly down-sampled to that size.

    Args:
        pid: Problem identifier passed to the ``codenet_reader`` helpers.
        srcs: List extended in place with the source text of each kept submission.
        lbls: List extended in place with the matching labels (see ``label``).

    Returns:
        ``(new_srcs, new_lbls)`` as two pandas Series holding what was appended,
        or ``None`` when the problem was skipped.
    """
    MIN_SOLUTIONS = 5
    MAX_SOLUTIONS = 250

    # Cheap pre-check before loading metadata; uses the same threshold as the
    # post-filter check below instead of a separate hard-coded number.
    num_python_solutions = len(codenet_reader.python_solutions(pid))
    if num_python_solutions < MIN_SOLUTIONS:
        return None

    df = codenet_reader.init_metadata_df(pid)

    # The None guard must run before the frame is indexed; filtering first
    # would raise TypeError on None and make this check unreachable.
    if df is None:
        return None

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the original metadata frame.
    df = df[df['status'].isin({'Accepted', 'Wrong Answer'})].copy()

    if df.shape[0] < MIN_SOLUTIONS:
        return None

    if df.shape[0] > MAX_SOLUTIONS:
        df = df.sample(MAX_SOLUTIONS, ignore_index=True)

    print(f'{pid}','-'*50)
    print(f' sampled {df.shape[0]} code snippets')

    df['src'] = df.apply(lambda row: codenet_reader.read_solution_file(pid, row['submission_id']), axis=1)
    df['lbl'] = df.apply(lambda row: label(pid, row), axis=1)

    new_srcs = df['src']
    new_lbls = df['lbl']

    srcs.extend(new_srcs.to_list())
    lbls.extend(new_lbls.to_list())

    return new_srcs, new_lbls


def test():
    """Smoke-test ``process_pid`` on a single problem with throwaway accumulators.

    Returns whatever ``process_pid`` returns for problem 'p03200'.
    """
    scratch_srcs, scratch_lbls = [], []
    # Alternative problem id for manual checks: 'p01701'
    return process_pid('p03200', scratch_srcs, scratch_lbls)

In [4]:
# Smoke run on one problem; the value returned by process_pid is shown as the cell output.
test()

p03200 --------------------------------------------------
 sampled 250 code snippets


(0      def main():\n    S = input()\n    w = 0\n    r...
 1      S = input()\n\nans = 0\ncnt = 0\nfor i in rang...
 2      s = input()\nn = len(s)\n\nb = 0\nb_count = 0\...
 3      import sys\n\n\ndef main():\n    s = sys.stdin...
 4      S=input()\nS=S[::-1]\nans=0\ncount=0\nfor i in...
                              ...                        
 245    import sys\n# input = sys.stdin.readline\n\nde...
 246    A = list(input())\n\ncount = 0\nans = 0\nfor a...
 247    #template\ndef inputlist(): return [int(j) for...
 248    S = str(input())\ncounter = 0\nans = 0\nfor i,...
 249    def resolve():\n    s = str(input())\n    ans ...
 Name: src, Length: 250, dtype: object,
 0      p03200-1
 1      p03200-1
 2      p03200-1
 3      p03200-1
 4      p03200-1
          ...   
 245    p03200-1
 246    p03200-1
 247    p03200-1
 248    p03200-1
 249    p03200-1
 Name: lbl, Length: 250, dtype: object)

In [5]:
# Reading data

# Upper bound on how many problems are drawn for this run.
SAMPLE_SIZE = 500

# Every entry under DATA_PATH is treated as a problem id; draw at most
# SAMPLE_SIZE of them at random (unseeded, so each run picks a different set).
_pids = os.listdir(codenet_reader.DATA_PATH)
_pids = random.sample(_pids, k=min(SAMPLE_SIZE, len(_pids)))

# Accumulators that process_pid extends in place.
srcs, lbls = [], []

print('Starting...')
print(f'Processing {len(_pids)} problems.')

for _pid in _pids:
    process_pid(_pid, srcs, lbls)

# Sources and labels must stay aligned one-to-one.
n_srcs, n_lbls = len(srcs), len(lbls)
assert n_srcs == n_lbls

# Labels end in '1' for accepted submissions (see label()).
accepted_count = sum(lbl[-1] == '1' for lbl in lbls)
rejected_count = n_srcs - accepted_count

print('Data gathered...')
print(f'{n_srcs} code snippets:')
print(f' {accepted_count} accepted')
print(f' {rejected_count} rejected')

# One row per snippet: (source text, label).
output_ls = list(zip(srcs, lbls))
output_df = pd.DataFrame(output_ls, columns=['source', 'label'])

print('DONE')

Starting...
Processing 500 problems.
p03422 --------------------------------------------------
 sampled 22 code snippets
p02367 --------------------------------------------------
 sampled 14 code snippets
p00094 --------------------------------------------------
 sampled 158 code snippets
p03789 --------------------------------------------------
 sampled 7 code snippets
p02783 --------------------------------------------------
 sampled 250 code snippets
p02417 --------------------------------------------------
 sampled 250 code snippets
p02861 --------------------------------------------------
 sampled 250 code snippets
p02866 --------------------------------------------------
 sampled 250 code snippets
p02680 --------------------------------------------------
 sampled 250 code snippets
p00027 --------------------------------------------------
 sampled 10 code snippets
p02451 --------------------------------------------------
 sampled 11 code snippets
p00412 ---------------------------

In [6]:
# Display the assembled frame (columns 'source' and 'label') for a quick visual check.
output_df

Unnamed: 0,source,label
0,import sys\nread = sys.stdin.buffer.read\nread...,p03422-1
1,import sys\nread = sys.stdin.buffer.read\nread...,p03422-0
2,N = int(input())\ng = 0\nfor _ in range(N):\n ...,p03422-1
3,"N = int(input())\n\ndef grundy(x, k):\n whi...",p03422-1
4,"A,I=0,input\nfor i in range(int(I())):\n a,k=m...",p03422-1
...,...,...
36818,"from operator import itemgetter\n\n\nn, x, d =...",p02840-1
36819,"from operator import itemgetter\n\n\nn, x, d =...",p02840-0
36820,"from operator import itemgetter\n\n\nn, x, d =...",p02840-0
36821,"from operator import itemgetter\n\n\nn, x, d =...",p02840-0


In [7]:
# Exporting
# Relative path: the file is written to the notebook's current working directory.
EXPORTED_FILE_NAME = 'output_single.csv'
# to_csv is called with its defaults, so the row index is written as an extra
# unnamed first column alongside 'source' and 'label'.
output_df.to_csv(EXPORTED_FILE_NAME)