# Combining Datasets

In [2]:
import pandas as pd

### Load datasets

In [5]:
# Cap on the number of rows read from each source CSV (passed as `nrows` to pd.read_csv).
# Files that contain fewer rows than this are simply loaded in full.
max_rows = 7_000

In [25]:
# Read each judge's CSV (at most `max_rows` rows) and, where a file uses different
# column names, rename them so every frame exposes the same names (e.g. 'solution').
source_code_to_solution = {'source_code': 'solution'}
uva_column_renames = {'Solution': 'solution', 'Problem Link': 'problem_link', 'Repository': 'source'}


def _read_capped(csv_path, column_renames=None):
    """Load one dataset CSV, limited to `max_rows` rows.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    column_renames : dict, optional
        Mapping of original column name -> unified column name. When omitted,
        the columns are kept exactly as they appear in the file.

    Returns
    -------
    pandas.DataFrame
        The loaded (and, if requested, renamed) frame.
    """
    frame = pd.read_csv(csv_path, nrows=max_rows)
    if column_renames:
        frame = frame.rename(columns=column_renames)
    return frame


atcoder_data = _read_capped('IPC_Datasets/atcoder_solutions.csv', source_code_to_solution)
codechef_data = _read_capped('IPC_Datasets/codechef_solutions.csv')
codeforces_data = _read_capped('IPC_Datasets/codeforces_submissions.csv', source_code_to_solution)
cses_data = _read_capped('IPC_Datasets/cses_solutions.csv')
hackerearth_data = _read_capped('IPC_Datasets/hackerearth_solutions.csv')
leetcode_data = _read_capped('IPC_Datasets/leetcode_solutions.csv')
uva_data = _read_capped('IPC_Datasets/uva_solutions.csv', uva_column_renames)
yosupo_data = _read_capped('IPC_Datasets/yosupo_solutions.csv')

In [26]:
# Report how many rows were actually loaded for each judge, followed by the grand total
loaded_frames = [
    ('AtCoder:', atcoder_data),
    ('CodeChef:', codechef_data),
    ('Codeforces:', codeforces_data),
    ('CSES:', cses_data),
    ('HackerEarth:', hackerearth_data),
    ('LeetCode:', leetcode_data),
    ('UVa:', uva_data),
    ('Yosupo:', yosupo_data),
]

row_total = 0
for label, frame in loaded_frames:
    row_count = len(frame)
    print(label, row_count)
    row_total += row_count
print('Total:', row_total)

AtCoder: 7000
CodeChef: 7000
Codeforces: 229
CSES: 1175
HackerEarth: 7000
LeetCode: 3570
UVa: 1347
Yosupo: 195
Total: 27516


### Add Online Judge column to the datasets

In [27]:
# Tag every row with the judge it was loaded from by adding a constant
# 'online_judge' column to each frame (the frames are modified in place).
judge_labels = (
    (atcoder_data, 'AtCoder'),
    (codechef_data, 'CodeChef'),
    (codeforces_data, 'Codeforces'),
    (cses_data, 'CSES'),
    (hackerearth_data, 'HackerEarth'),
    (leetcode_data, 'LeetCode'),
    (uva_data, 'UVa'),
    # NOTE(review): lowercase, unlike the other labels — confirm this is intended
    (yosupo_data, 'yosupo'),
)
for frame, judge_name in judge_labels:
    frame['online_judge'] = judge_name

In [29]:
# List each dataset's column names in alphabetical order so the schemas can be compared
frames_by_label = [
    ('AtCoder:', atcoder_data),
    ('CodeChef:', codechef_data),
    ('Codeforces:', codeforces_data),
    ('CSES:', cses_data),
    ('HackerEarth:', hackerearth_data),
    ('LeetCode:', leetcode_data),
    ('UVa:', uva_data),
    ('Yosupo:', yosupo_data),
]
for label, frame in frames_by_label:
    print(label, sorted(frame.columns))

AtCoder: ['id', 'online_judge', 'problem_link', 'solution', 'submission_link']
CodeChef: ['id', 'online_judge', 'problem_link', 'solution']
Codeforces: ['id', 'language', 'memory', 'online_judge', 'solution', 'submission_link', 'time']
CSES: ['id', 'online_judge', 'problem_link', 'solution', 'source']
HackerEarth: ['id', 'online_judge', 'problem_name', 'solution']
LeetCode: ['id', 'online_judge', 'problem_link', 'solution', 'source']
UVa: ['id', 'online_judge', 'problem_link', 'solution', 'source']
Yosupo: ['id', 'online_judge', 'problem_link', 'solution', 'source']


### Combine the datasets

In [32]:
# Stack all per-judge frames row-wise. Columns are unioned, so a column that a
# given frame lacks is filled with NaN for that frame's rows. Each frame keeps
# its own row labels, which means index values repeat in the combined frame.
judge_frames = [
    atcoder_data,
    codechef_data,
    codeforces_data,
    cses_data,
    hackerearth_data,
    leetcode_data,
    uva_data,
    yosupo_data,
]
combined_data = pd.concat(judge_frames, axis=0)

In [34]:
# Drop the 'id' column carried over from the source files.
# Reassigning the result (rather than using inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# raises KeyError because 'id' has already been removed.
combined_data = combined_data.drop(columns='id', errors='ignore')

# Summarise the remaining columns and their non-null counts
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27516 entries, 0 to 194
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   problem_link     20287 non-null  object
 1   submission_link  7229 non-null   object
 2   solution         27516 non-null  object
 3   online_judge     27516 non-null  object
 4   language         229 non-null    object
 5   time             229 non-null    object
 6   memory           229 non-null    object
 7   source           6287 non-null   object
 8   problem_name     7000 non-null   object
dtypes: object(9)
memory usage: 2.1+ MB


### Reorder the columns

In [35]:
# Put the judge and problem/submission identifiers first, submission metadata next,
# and the solution text last.
column_order = [
    'online_judge',
    'problem_link',
    'submission_link',
    'problem_name',
    'source',
    'language',
    'time',
    'memory',
    'solution',
]
combined_data = combined_data[column_order]

In [36]:
# Preview the first five rows of the combined frame
combined_data.head(n=5)

Unnamed: 0,online_judge,problem_link,submission_link,problem_name,source,language,time,memory,solution
0,AtCoder,https://atcoder.jp/contests/agc001/tasks/agc00...,https://atcoder.jp/contests/agc001/submissions...,,,,,,#include <bits/stdc++.h>\r\nusing namespace st...
1,AtCoder,https://atcoder.jp/contests/agc001/tasks/agc00...,https://atcoder.jp/contests/agc001/submissions...,,,,,,// LUOGU_RID: 147675545\n#include<bits/stdc++....
2,AtCoder,https://atcoder.jp/contests/agc001/tasks/agc00...,https://atcoder.jp/contests/agc001/submissions...,,,,,,#include <bits/stdc++.h>\n\nusing namespace st...
3,AtCoder,https://atcoder.jp/contests/agc001/tasks/agc00...,https://atcoder.jp/contests/agc001/submissions...,,,,,,// LUOGU_RID: 147582298\n#include <bits/stdc++...
4,AtCoder,https://atcoder.jp/contests/agc001/tasks/agc00...,https://atcoder.jp/contests/agc001/submissions...,,,,,,// LUOGU_RID: 147550123\n#include<bits/stdc++....


### Save the combined dataset

In [37]:
# Write the combined dataset to disk.
# The concatenated frame still carries each source frame's own row labels, so the
# labels repeat across judges. Renumber the rows 0..n-1 on the way out so that the
# saved 'index' column is a unique row identifier. reset_index returns a new frame,
# so `combined_data` itself is left unchanged.
combined_data.reset_index(drop=True).to_csv(
    'IPC_Datasets/combined_solutions_v1.csv',
    index=True,
    index_label='index',
)