In [1]:
import json
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm
from pathlib import Path
from collections import defaultdict
from features.utils import build_mapping_to_ids

warnings.filterwarnings('ignore')

In [2]:
import sqlite3
import os

In [3]:
# Path to your solutions.sqlar file
db_path = 'solutions.sqlar'

# Connect to the SQLAR file
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

In [4]:
cursor.execute("""
    SELECT name, LENGTH(data) as data_length 
    FROM sqlar 
    WHERE name LIKE '%.java' 
    ORDER BY data_length ASC 
    LIMIT 5
""")

cursor.fetchall()

[('solutions/elizarov.1.java', 833),
 ('solutions/stolis.1.java', 931),
 ('solutions/peter707.0.java', 990),
 ('solutions/peter707.9.java', 990),
 ('solutions/elizarov.0.java', 1028)]

In [6]:
# Query to list all .java files in the archive
cursor.execute("SELECT data  FROM sqlar WHERE name LIKE '%.java'")

# Fetch all matching files
java_files = cursor.fetchall()

In [9]:
code_snippets = [row[0].decode('utf-8') for row in java_files]
code_snippets[0]

'import java.io.*;\r\nimport java.math.*;\r\nimport java.util.*;\r\nimport java.util.stream.*;\r\n\r\npublic class Solution {\r\n\r\n\tvoid solve() {\r\n\t\tn = nextInt();\r\n\t\tlong[] xs = new long[n];\r\n\t\tlong[] ys = new long[n];\r\n\t\tlong[] zs = new long[n];\r\n\t\t\r\n\t\tfor (int i = 0; i < n; i++) {\r\n\t\t\txs[i] = nextLong();\r\n\t\t\tys[i] = nextLong();\r\n\t\t\tzs[i] = nextLong();\r\n\t\t}\r\n\t\t\r\n\t\tbad = new int[n * n * n];\r\n\t\t\r\n\t\tfor (int i = 0; i < n; i++) {\r\n\t\t\tfor (int j = i + 1; j < n; j++) {\r\n\t\t\t\tfor (int k = j + 1; k < n; k++) {\r\n\t\t\t\t\t\r\n\t\t\t\t\tlong x1 = xs[j] - xs[i];\r\n\t\t\t\t\tlong y1 = ys[j] - ys[i];\r\n\t\t\t\t\tlong z1 = zs[j] - zs[i];\r\n\t\t\t\t\t\r\n\t\t\t\t\tlong x2 = xs[k] - xs[i];\r\n\t\t\t\t\tlong y2 = ys[k] - ys[i];\r\n\t\t\t\t\tlong z2 = zs[k] - zs[i];\r\n\t\t\t\t\t\r\n\t\t\t\t\tlong a = y1 * z2 - y2 * z1;\r\n\t\t\t\t\tlong b = z1 * x2 - z2 * x1;\r\n\t\t\t\t\tlong c = x1 * y2 - x2 * y1;\r\n\t\t\t\t\t\r\n\t\t\t\

In [8]:
from features import *

samples = calculate_features_for_files(code_snippets)

139


ValueError: too many values to unpack (expected 2)

In [39]:
X = build_dataset(samples)

print(f'Number of samples: {X.shape[0]}')
print(f'Number of features: {X.shape[1]}')

Number of samples: 21
Number of features: 787


# Data

### Get all problems

In [40]:
with open('data/metadata.json', 'r') as file:
    metadata = json.load(file)

In [41]:
dataset_path = Path('data/codejam/')
files = list(dataset_path.rglob('*.java'))

In [42]:
usernames = set(it.stem for it in files)
username_to_id = build_mapping_to_ids(usernames)

In [43]:
dataset = pd.DataFrame({
    'path': files,
    'round_id': [int(it.parts[2]) for it in files],
    'problem_id': [int(it.parts[3]) for it in files],
    'user_id': [username_to_id[it.stem] for it in files]
})

In [44]:
dataset.head()

Unnamed: 0,path,round_id,problem_id,user_id


### Select 100 random users who has at least 9 files

In [45]:
N_FILES = 9
N_USERS = 100

In [46]:
random.seed(0)
np.random.seed(0)

In [47]:
count = dataset.groupby('user_id').problem_id.count()
users = count[count >= N_FILES].index
users = np.random.choice(users, N_USERS, replace=False)

ValueError: 'a' cannot be empty unless no samples are taken

In [10]:
parts = [dataset[dataset.user_id == user].sample(n=N_FILES, replace=False) for user in users]
dataset = pd.concat(parts).reset_index(drop=True)

# Create new user ids
user_id_to_new_id = build_mapping_to_ids(dataset.user_id)
dataset.user_id = dataset.user_id.apply(lambda it: user_id_to_new_id[it])

In [None]:
dataset.head()

# Build dataset

In [10]:
from features import *
from sklearn.feature_selection import mutual_info_regression

In [11]:
# Query to list all .java files in the archive
cursor.execute("SELECT data , name  FROM sqlar WHERE name LIKE '%.java'")

# Fetch all matching files
java_files = cursor.fetchall()

In [14]:
code_snippets = [(row[1], row[0].decode('utf-8')) for row in java_files]

In [20]:
calculate_features_for_files([(row[1], row[0].decode('utf-8')) for row in java_files])

139


[]

In [17]:
# Query to list all .java files in the archive
len(samples)

0

In [10]:
X = build_dataset(samples)
y = dataset.user_id.values

print(f'Number of samples: {X.shape[0]}')
print(f'Number of features: {X.shape[1]}')

### Select the best 1500 features according to mutual information

In [None]:
mi = mutual_info_regression(np.nan_to_num(X), y, random_state=0)
mi /= np.max(mi)

In [None]:
mi_indices = np.argsort(mi)
features_indices = mi_indices[-1500:]
features = X.columns[features_indices].values
X = X[features]

print(f'Number of samples: {X.shape[0]}')
print(f'Number of features: {X.shape[1]}')

### Select top 1500 popular features

In [None]:
nan_count = X.isna().sum(axis=0)
indices = np.argsort(nan_count.values)
features = nan_count[indices][:1500].index
X = X[features]

print(f'Number of samples: {X.shape[0]}')
print(f'Number of features: {X.shape[1]}')

In [None]:
X.head()

# Classification

In [17]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

In [None]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

for index, (train_index, valid_index) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    model = CatBoostClassifier(
        iterations=500, 
        learning_rate=0.2,
        rsm=0.01,
        depth=3,
        bootstrap_type='Bernoulli',
        subsample=0.7,
        loss_function='MultiClass'
    )
    
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), plot=False, verbose=False)
    
    y_pred = model.predict(X_train).squeeze()
    train_acc = np.average(y_train == y_pred)

    y_pred = model.predict(X_valid).squeeze()
    valid_acc = np.average(y_valid == y_pred)

    print(f'Validation #{index + 1}')
    print(f'Train accuracy: {train_acc:.2f}')
    print(f'Valid accuracy: {valid_acc:.2f}\n')