In [1]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder  

In [2]:
# Load the problem set; the file is read as JSON Lines (one JSON object per line).
try:
    df = pd.read_json('problems_data.jsonl', lines=True)
except ValueError:
    # Keep the short hint for the reader, then re-raise so the cell fails with
    # the full traceback. Calling exit() here would shut the notebook kernel
    # down and hide the underlying parse error.
    print("Error: check filetype again")
    raise

In [3]:
# Discard rows that lack either a score or a difficulty class.
df = df.dropna(subset=['problem_score', 'problem_class'])

# Turn the class names into integer labels.
le = LabelEncoder()
df['difficulty_label'] = le.fit_transform(df['problem_class'])

# Build a name -> code lookup; codes are cast to plain int so the printed
# dict does not show numpy int64 values.
class_mapping = {}
for class_name, class_code in zip(le.classes_, le.transform(le.classes_)):
    class_mapping[class_name] = int(class_code)
print("Mapping:", class_mapping)  # recorded output: easy=0, hard=1, medium=2

# Free-text fields that make up a problem statement; missing values become
# empty strings so the concatenation below never meets NaN.
text_columns = ['title', 'description', 'input_description', 'output_description']
df[text_columns] = df[text_columns].fillna('')

# Join the fields left to right with a single space between them.
combined = df[text_columns[0]]
for col in text_columns[1:]:
    combined = combined + " " + df[col]
df['combined_text'] = combined

# Markup to strip: HTML comments, plus anything shaped like a tag or
# declaration ("<" immediately followed by an optional "/" or "!" and a
# letter). Only lowercase letters are needed because the pattern is applied
# after lower(). [^<>] also matches newlines, so tags that wrap across lines
# are removed too.
_HTML_MARKUP_RE = re.compile(r'<!--.*?-->|<[/!]?[a-z][^<>]*>', re.DOTALL)


def upd(text):
    """Normalise a problem statement for text features.

    Lowercases the text, removes HTML markup and collapses every run of
    whitespace (including newlines) into a single space, trimming both ends.

    Only tag-shaped markup is removed. A bare "<" or ">" used as a comparison
    (e.g. "1 < n and m > 2") is kept, whereas a blanket ``<.*?>`` pattern
    would delete everything between the two signs. Comparisons written
    without a space after "<" (e.g. "a<b and c>d") still look like a tag
    and are removed.

    Args:
        text: The raw statement as a ``str``.

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    text = text.lower()
    # Markup is replaced by nothing (not a space), so text on either side of
    # a tag is joined directly.
    text = _HTML_MARKUP_RE.sub('', text)
    return " ".join(text.split())

# Run every combined statement through upd and store the result back in place.
df['combined_text'] = df['combined_text'].map(upd)

Mapping: {'easy': 0, 'hard': 1, 'medium': 2}


In [6]:
# Keywords used as "hard topic" indicators. Each entry is passed to
# Series.str.contains, which by default treats it as a regular expression and
# matches anywhere in the text (so 'flow' also hits 'overflow'). Escape regex
# metacharacters if new topics are added.
hard_topics = [
    'convex hull', 'mobius', 'segment tree', 'flow', 'centroid',
    'geometry', 'gcd', 'subarray', 'mex', 'dynamic programming',
    'modulo', 'bitwise', 'graph', 'expected value', 'permutations', 'xor'
]

# Statements with fewer characters than this are flagged as short.
SHORT_STATEMENT_MAX_LEN = 300

# Scan the text once per topic and reuse that result for both the per-topic
# indicator column (has_<topic>, spaces replaced by underscores) and the
# running total of matched topics.
hard_topic_count = pd.Series(0, index=df.index)
for topic in hard_topics:
    has_topic = df['combined_text'].str.contains(topic, case=False).astype(int)
    df[f'has_{topic.replace(" ", "_")}'] = has_topic
    hard_topic_count = hard_topic_count + has_topic
# Assigned after the loop so the column keeps its position after the has_* columns.
df['hard_topic_count'] = hard_topic_count

# Length features: measure the cleaned text once and derive both columns from it.
text_len = df['combined_text'].str.len()
df['is_short_statement'] = (text_len < SHORT_STATEMENT_MAX_LEN).astype(int)
df['text_len'] = text_len

# 1 when the statement mentions a large bound written as 10^5, 10^9, 1e5, 1e9
# or the literal 1000000007.
df['has_high_constraints'] = df['combined_text'].str.contains(r'10\^5|10\^9|1000000007|1e9|1e5', regex=True).astype(int)