In [None]:
# TODO: maybe precalculate the augmentations and include them with the same label

In [27]:
import os
import pandas as pd
import random
# functions for reading data from codenet:
import codenet_reader

In [28]:
# Functions for processing solutions of a problem


def label(pid: str, row) -> str:
    """Build the class label for one submission of problem `pid`.

    The label is the problem id followed by '-1' when the row's 'status'
    field equals 'Accepted' and by '-0' for any other status.
    """
    verdict_bit = '0'
    if row['status'] == 'Accepted':
        verdict_bit = '1'
    return f'{pid}-{verdict_bit}'


def process_pid(pid: str, srcs: list[str], lbls: list[str],
                min_solutions: int = 5, max_solutions: int = 250,
                random_state=None):
    """Collect sources and labels for one problem and append them to `srcs` / `lbls`.

    Args:
        pid: Problem id, passed through to the `codenet_reader` helpers.
        srcs: Accumulator list; the source text of every kept submission is
            appended to it in place.
        lbls: Accumulator list; the matching labels (see `label`) are appended
            to it in place, in the same order as `srcs`.
        min_solutions: Problems with fewer Python solutions, or fewer metadata
            rows, than this are skipped entirely.
        max_solutions: Upper bound on the number of submissions kept per
            problem; larger problems are randomly subsampled down to it.
        random_state: Forwarded to `DataFrame.sample` so the subsampling can be
            made reproducible. `None` keeps the previous unseeded behaviour.

    Returns:
        A `(sources, labels)` pair of pandas Series for the kept submissions,
        or `None` when the problem is skipped (too few solutions or no
        metadata). Nothing is appended to `srcs` / `lbls` in the skipped case.
    """
    # Cheap pre-check: skip the problem before loading its metadata at all.
    num_python_solutions = len(codenet_reader.python_solutions(pid))
    if num_python_solutions < min_solutions:
        return None

    df = codenet_reader.init_metadata_df(pid)
    if df is None or df.shape[0] < min_solutions:
        return None

    # Cap the per-problem contribution so large problems do not dominate.
    if df.shape[0] > max_solutions:
        df = df.sample(max_solutions, ignore_index=True, random_state=random_state)

    print(f'{pid}','-'*50)
    print(f' sampled {df.shape[0]} code snippets')

    df['src'] = df.apply(lambda row: codenet_reader.read_solution_file(pid, row['submission_id']), axis=1)
    df['lbl'] = df.apply(lambda row: label(pid, row), axis=1)

    new_srcs = df['src']
    new_lbls = df['lbl']

    srcs.extend(new_srcs.to_list())
    lbls.extend(new_lbls.to_list())

    return new_srcs, new_lbls


def test():
    """Smoke-run `process_pid` on problem 'p03200' with throwaway accumulators.

    Returns whatever `process_pid` returns for that problem.
    """
    # Another problem id that can be substituted here: 'p01701'
    scratch_srcs, scratch_lbls = [], []
    return process_pid('p03200', scratch_srcs, scratch_lbls)

In [29]:
# Smoke run on a single problem; the cell displays whatever `process_pid` returned.
test()

p03200 --------------------------------------------------
 sampled 250 code snippets


(0      S=list(input())\nans=0\nbl=0\nfor s in S:\n  i...
 1      S = list(input())\n\nblacks = []\nfor i in ran...
 2      import sys\na = input()\nS = [s for s in a]\n#...
 3      s = input()\nans = 0\n\ndef reverse(s,i):\n  i...
 4      import re\nimport sys\nimport math\nimport ite...
                              ...                        
 245    s = input()\ncnt_b = 0\nans = 0\nfor c in s:\n...
 246    S = input()[::-1]\nw = 0\nans = 0\nfor e in S:...
 247    S = list(input())\n\ncnt = 0\nfor i in range(l...
 248    def myAnswer(S:list) -> int:\n   N = len(S)\n ...
 249    import math\nimport fractions\nimport bisect\n...
 Name: src, Length: 250, dtype: object,
 0      p03200-1
 1      p03200-1
 2      p03200-0
 3      p03200-0
 4      p03200-1
          ...   
 245    p03200-1
 246    p03200-1
 247    p03200-0
 248    p03200-0
 249    p03200-1
 Name: lbl, Length: 250, dtype: object)

In [33]:
# Reading data
#
# Draws up to SAMPLE_SIZE problems from the dataset directory, gathers their
# sources and labels through `process_pid`, and assembles `output_df`.

SAMPLE_SIZE = 500
SAMPLE_SEED = 0  # fixed seed so the same problems are drawn on every run

# os.listdir returns entries in arbitrary order, so sort first: otherwise the
# seeded draw below would still depend on the filesystem's listing order.
_pids = sorted(os.listdir(codenet_reader.DATA_PATH))
# A dedicated Random instance keeps the draw reproducible without touching the
# global `random` state. The per-problem subsampling done inside `process_pid`
# is a separate random draw and is not controlled by this seed.
_pids = random.Random(SAMPLE_SEED).sample(_pids, k=min(SAMPLE_SIZE, len(_pids)))

srcs = []
lbls = []

print('Starting...')
print(f'Processing {len(_pids)} problems.')

# `process_pid` appends to `srcs` / `lbls` in place; skipped problems add nothing.
for _pid in _pids:
    process_pid(_pid, srcs, lbls)

n_srcs = len(srcs)
n_lbls = len(lbls)
assert n_srcs == n_lbls, f'sources/labels out of sync: {n_srcs} != {n_lbls}'

# Labels have the form '<pid>-<0|1>'; the '-1' suffix marks an accepted submission.
accepted_count = sum(1 for lbl in lbls if lbl.endswith('-1'))
rejected_count = n_srcs - accepted_count

print('Data gathered...')
print(f'{len(srcs)} code snippets:')
print(f' {accepted_count} accepted')
print(f' {rejected_count} rejected')

output_ls = list(zip(srcs, lbls))
output_df = pd.DataFrame(output_ls, columns=['source', 'label'])

print('DONE')

Starting...
Processing 500 problems.
p02394 --------------------------------------------------
 sampled 250 code snippets
p03244 --------------------------------------------------
 sampled 250 code snippets
p02340 --------------------------------------------------
 sampled 7 code snippets
p02890 --------------------------------------------------
 sampled 97 code snippets
p03037 --------------------------------------------------
 sampled 250 code snippets
p03504 --------------------------------------------------
 sampled 250 code snippets
p02439 --------------------------------------------------
 sampled 27 code snippets
p03005 --------------------------------------------------
 sampled 250 code snippets
p04000 --------------------------------------------------
 sampled 250 code snippets
p02952 --------------------------------------------------
 sampled 250 code snippets
p03210 --------------------------------------------------
 sampled 250 code snippets
p02893 -------------------------

In [34]:
# Display the assembled dataset: one row per code snippet, columns `source` and `label`.
output_df

Unnamed: 0,source,label
0,"W,H,x,y,r = map(int,input().split())\nFlag=Tru...",p02394-1
1,"W,H,x,y,r=map(int,input().split())\nif x>=0 an...",p02394-1
2,"W, H, x, y, r = map(int, input().split())\n\ni...",p02394-1
3,"w, h, x, y, r = map(int, input().split())\nres...",p02394-1
4,"W,H,x,y,r=list(map(int,input().split()))\nif x...",p02394-1
...,...,...
34801,"from collections import deque\nH, W = map(int,...",p02803-0
34802,"from collections import deque\n \nh, w = map(i...",p02803-1
34803,from collections import deque\n''' \nINF = flo...,p02803-0
34804,"H, W = map(int, input().split())\nS = []\nfor ...",p02803-1


In [35]:
# Exporting
# Writes the assembled dataset to a CSV file; the relative name resolves against
# the kernel's current working directory.
# NOTE(review): no `index=` argument is passed, so pandas' default applies and the
# row index is presumably written as a leading unnamed column — confirm that
# downstream readers expect it.
EXPORTED_FILE_NAME = 'output_single.csv'
output_df.to_csv(EXPORTED_FILE_NAME)