In [1]:
import pandas as pd

df = pd.read_csv("../data/data.csv")

print(df.shape)
print(df.columns)
df.head()


(4112, 8)
Index(['title', 'description', 'input_description', 'output_description',
       'sample_io', 'problem_class', 'problem_score', 'url'],
      dtype='object')


Unnamed: 0,title,description,input_description,output_description,sample_io,problem_class,problem_score,url
0,Uuu,Unununium (Uuu) was the name of the chemical\n...,The input consists of one line with two intege...,The output consists of $M$ lines where the $i$...,"[{'input': '7 10', 'output': '1 2\n2 3\n1 3\n3...",hard,9.7,https://open.kattis.com/problems/uuu
1,House Building,A number of eccentrics from central New York h...,"The input consists of $10$ test cases, which a...",Print $K$ lines with\n the positions of the...,"[{'input': '0\n2 3 2\n50 60 50\n30 50 40', 'ou...",hard,9.7,https://open.kattis.com/problems/husbygge
2,Mario or Luigi,Mario and Luigi are playing a game where they ...,,,"[{'input': '', 'output': ''}]",hard,9.6,https://open.kattis.com/problems/marioorluigi
3,The Wire Ghost,Žofka is bending a copper wire. She starts wit...,The first line contains two integers $L$ and $...,The output consists of a single line consistin...,"[{'input': '4 3\n3 C\n2 C\n1 C', 'output': 'GH...",hard,9.6,https://open.kattis.com/problems/thewireghost
4,Barking Up The Wrong Tree,"Your dog Spot is let loose in the park. Well, ...",The first line of input consists of two intege...,Write a single line containing the length need...,"[{'input': '2 0\n10 0\n10 10', 'output': '14.1...",hard,9.6,https://open.kattis.com/problems/barktree


In [2]:
df.isnull().sum()


title                   0
description            81
input_description     120
output_description    131
sample_io               0
problem_class           0
problem_score           0
url                     0
dtype: int64

In [3]:
import pandas as pd

df = pd.read_csv("../data/data.csv")

print(df.columns)
print(df.head(3))
print(df.isna().sum())


Index(['title', 'description', 'input_description', 'output_description',
       'sample_io', 'problem_class', 'problem_score', 'url'],
      dtype='object')
            title                                        description  \
0             Uuu  Unununium (Uuu) was the name of the chemical\n...   
1  House Building  A number of eccentrics from central New York h...   
2  Mario or Luigi  Mario and Luigi are playing a game where they ...   

                                   input_description  \
0  The input consists of one line with two intege...   
1  The input consists of $10$ test cases, which a...   
2                                                NaN   

                                  output_description  \
0  The output consists of $M$ lines where the $i$...   
1  Print $K$ lines with\n    the positions of the...   
2                                                NaN   

                                           sample_io problem_class  \
0  [{'input': '7 10', 'output': '

In [4]:
text_cols = [
    "title",
    "description",
    "input_description",
    "output_description"
]

for col in text_cols:
    df[col] = df[col].fillna("")


In [5]:
df["combined_text"] = (
    df["title"] + " " +
    df["description"] + " " +
    df["input_description"] + " " +
    df["output_description"]
)


In [6]:
import re

In [9]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-z0-9+\-*/%=<> ]", "", text)
    return text.strip()


In [14]:
df["combined_text"] = df["combined_text"].apply(clean_text)


In [16]:
df[["combined_text", "problem_class", "problem_score"]].head()


Unnamed: 0,combined_text,problem_class,problem_score
0,uuu unununium uuu was the name of the chemical...,hard,9.7
1,house building a number of eccentrics from cen...,hard,9.7
2,mario or luigi mario and luigi are playing a g...,hard,9.6
3,the wire ghost ofka is bending a copper wire s...,hard,9.6
4,barking up the wrong tree your dog spot is let...,hard,9.6


#Feature Engineering

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_tfidf = tfidf.fit_transform(df["combined_text"])


In [19]:
df["text_length"] = df["combined_text"].apply(len)


In [20]:
df["line_count"] = df["combined_text"].apply(lambda x: x.count(" "))


In [23]:
math_symbols = r"[+\-*/%=<>]"

df["math_symbol_count"] = df["combined_text"].apply(
    lambda x: len(re.findall(math_symbols, x))
)


In [24]:
keywords = [
    "dp", "dynamic programming", "graph", "tree", "dfs", "bfs",
    "recursion", "bitmask", "segment tree", "flow", "greedy"
]

for kw in keywords:
    df[f"kw_{kw.replace(' ', '_')}"] = df["combined_text"].apply(
        lambda x: x.count(kw)
    )


In [25]:
from scipy.sparse import hstack

X_numeric = df[
    ["text_length", "line_count", "math_symbol_count"] +
    [f"kw_{kw.replace(' ', '_')}" for kw in keywords]
].values

X = hstack([X_tfidf, X_numeric])


In [26]:
y_class = df["problem_class"]    # Easy / Medium / Hard
y_score = df["problem_score"]    # numerical


Classification Model


In [28]:
from sklearn.model_selection import train_test_split

X_text = df["combined_text"]
y_class = df["problem_class"]

X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_text,
    y_class,
    test_size=0.2,
    random_state=42,
    stratify=y_class
)


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_tfidf_train = tfidf.fit_transform(X_text_train)
X_tfidf_test  = tfidf.transform(X_text_test)


In [30]:
import numpy as np
import re

def extract_numeric_features(text_series):
    text_length = text_series.apply(len)
    word_count = text_series.apply(lambda x: x.count(" "))
    math_symbol_count = text_series.apply(
        lambda x: len(re.findall(r"[+\-*/%=<>]", x))
    )

    keywords = [
        "dp", "graph", "tree", "dfs", "bfs",
        "recursion", "bitmask", "greedy", "flow"
    ]

    keyword_features = []
    for kw in keywords:
        keyword_features.append(text_series.apply(lambda x: x.count(kw)))

    return np.column_stack(
        [text_length, word_count, math_symbol_count] +
        keyword_features
    )


In [31]:
X_num_train = extract_numeric_features(X_text_train)
X_num_test  = extract_numeric_features(X_text_test)


In [32]:
from scipy.sparse import hstack

X_train = hstack([X_tfidf_train, X_num_train])
X_test  = hstack([X_tfidf_test,  X_num_test])


In [33]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(
    max_iter=1000,
    n_jobs=-1,
    class_weight="balanced"
)

clf.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [34]:
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("Classification Accuracy:", acc)


Classification Accuracy: 0.3888213851761847


In [35]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[108  20  25]
 [133 143 113]
 [120  92  69]]
              precision    recall  f1-score   support

        easy       0.30      0.71      0.42       153
        hard       0.56      0.37      0.44       389
      medium       0.33      0.25      0.28       281

    accuracy                           0.39       823
   macro avg       0.40      0.44      0.38       823
weighted avg       0.43      0.39      0.38       823



In [36]:
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

scaler = StandardScaler(with_mean=False)

X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_test_scaled  = scaler.transform(X_num_test)

X_train = hstack([X_tfidf_train, X_num_train_scaled])
X_test  = hstack([X_tfidf_test,  X_num_test_scaled])


In [37]:
from sklearn.svm import LinearSVC

svm_clf = LinearSVC(
    C=0.5,
    class_weight="balanced",
    max_iter=5000,
    random_state=42
)

svm_clf.fit(X_train, y_train)




0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,0.5
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [38]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred_svm = svm_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


Accuracy: 0.5006075334143378
[[ 71  41  41]
 [ 52 246  91]
 [ 50 136  95]]
              precision    recall  f1-score   support

        easy       0.41      0.46      0.44       153
        hard       0.58      0.63      0.61       389
      medium       0.42      0.34      0.37       281

    accuracy                           0.50       823
   macro avg       0.47      0.48      0.47       823
weighted avg       0.49      0.50      0.50       823



In [39]:
import numpy as np

feature_names = tfidf.get_feature_names_out()
coef = svm_clf.coef_

for i, cls in enumerate(svm_clf.classes_):
    top = np.argsort(coef[i])[-15:]
    print(f"\nTop words for {cls}:")
    print(feature_names[top])



Top words for easy:
['symbols' 'hn' 'forward' 'pub' 'crackers' 'brownie' 'ingredients'
 'problems' 'read' 'assume' 'world' 'characters' 'consists single'
 'bottles' 'single']

Top words for hard:
['consists output' 'routes' 'non' 'drones' 'sections' 'modulo' 'string'
 'arbitrary' 'possible' 'cat' 'divisors' 'queries' 'exists' 'fence'
 'takes']

Top words for medium:
['vehicles' 'reservoir' 'players' 'gate' 'legal' 'different' 'walls'
 'colour' 'column' 'bi' 'chikapu' 'divisible' 'win' 'stock' 'change']


In [40]:
X_num = extract_numeric_features(df["combined_text"])
y = df["problem_class"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_num, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)



In [41]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_leaf=5,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)


0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [42]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.40097205346294046
[[ 92  40  21]
 [121 188  80]
 [102 129  50]]
              precision    recall  f1-score   support

        easy       0.29      0.60      0.39       153
        hard       0.53      0.48      0.50       389
      medium       0.33      0.18      0.23       281

    accuracy                           0.40       823
   macro avg       0.38      0.42      0.38       823
weighted avg       0.42      0.40      0.39       823



In [43]:
import pandas as pd

feature_names = [
    "text_length", "word_count", "math_symbol_count",
    "kw_dp", "kw_graph", "kw_tree", "kw_dfs", "kw_bfs",
    "kw_recursion", "kw_bitmask", "kw_greedy", "kw_flow"
]

importance = pd.Series(
    rf.feature_importances_,
    index=feature_names
).sort_values(ascending=False)

print(importance)


text_length          0.375862
word_count           0.353126
math_symbol_count    0.143724
kw_graph             0.050954
kw_tree              0.047239
kw_dp                0.013474
kw_flow              0.011536
kw_greedy            0.004086
kw_bfs               0.000000
kw_dfs               0.000000
kw_bitmask           0.000000
kw_recursion         0.000000
dtype: float64


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_tfidf_train = tfidf.fit_transform(X_text_train)
X_tfidf_test  = tfidf.transform(X_text_test)


In [45]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(
    n_components=300,   # 200–500 is ideal
    random_state=42
)

X_tfidf_train_reduced = svd.fit_transform(X_tfidf_train)
X_tfidf_test_reduced  = svd.transform(X_tfidf_test)


In [46]:
X_num_train = extract_numeric_features(X_text_train)
X_num_test  = extract_numeric_features(X_text_test)


In [50]:
import numpy as np

X_train_all = np.hstack([X_tfidf_train_reduced, X_num_train])
X_test_all  = np.hstack([X_tfidf_test_reduced,  X_num_test])


In [55]:
from sklearn.ensemble import RandomForestClassifier

rf_all = RandomForestClassifier(
    n_estimators=500,
    max_depth=25,
    min_samples_leaf=3,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf_all.fit(X_train_all, y_train)


0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,25
,min_samples_split,2
,min_samples_leaf,3
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [57]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = rf_all.predict(X_test_all)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.5164034021871203
[[ 31  98  24]
 [  7 365  17]
 [ 13 239  29]]
              precision    recall  f1-score   support

        easy       0.61      0.20      0.30       153
        hard       0.52      0.94      0.67       389
      medium       0.41      0.10      0.17       281

    accuracy                           0.52       823
   macro avg       0.51      0.41      0.38       823
weighted avg       0.50      0.52      0.43       823



In [59]:
from sklearn.model_selection import train_test_split

X_text = df["combined_text"]
y_score = df["problem_score"]

X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_text,
    y_score,
    test_size=0.2,
    random_state=42
)


In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_tfidf_train = tfidf.fit_transform(X_text_train)
X_tfidf_test  = tfidf.transform(X_text_test)


In [61]:
X_num_train = extract_numeric_features(X_text_train)
X_num_test  = extract_numeric_features(X_text_test)


In [62]:
from scipy.sparse import hstack

X_train = hstack([X_tfidf_train, X_num_train])
X_test  = hstack([X_tfidf_test,  X_num_test])


In [63]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [64]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

y_pred = lr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("MAE :", mae)
print("RMSE:", rmse)


MAE : 1.9215117595078104
RMSE: 2.404874644836902


In [69]:
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor(
    n_estimators=400,
    max_depth=20,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)

rf_reg.fit(X_train, y_train)


0,1,2
,n_estimators,400
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [70]:
y_pred_rf = rf_reg.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print("RF MAE :", mae_rf)
print("RF RMSE:", rmse_rf)


RF MAE : 1.7044355329853023
RF RMSE: 2.0467033368688297


In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_tfidf_train = tfidf.fit_transform(X_text_train)
X_tfidf_test  = tfidf.transform(X_text_test)


In [72]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(
    n_components=300,
    random_state=42
)

X_tfidf_train_red = svd.fit_transform(X_tfidf_train)
X_tfidf_test_red  = svd.transform(X_tfidf_test)


In [73]:
X_num_train = extract_numeric_features(X_text_train)
X_num_test  = extract_numeric_features(X_text_test)


In [74]:
import numpy as np

X_train = np.hstack([X_tfidf_train_red, X_num_train])
X_test  = np.hstack([X_tfidf_test_red,  X_num_test])


In [75]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    random_state=42
)

gbr.fit(X_train, y_train)


0,1,2
,loss,'squared_error'
,learning_rate,0.05
,n_estimators,300
,subsample,0.8
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,4
,min_impurity_decrease,0.0


In [76]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

y_pred = gbr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("GBR MAE :", mae)
print("GBR RMSE:", rmse)


GBR MAE : 1.6995694591554906
GBR RMSE: 2.052194397933899


In [77]:
import pandas as pd

importance = pd.Series(
    gbr.feature_importances_
).sort_values(ascending=False)

print(importance.head(15))


301    0.050993
300    0.026273
2      0.026172
11     0.012975
6      0.012554
32     0.010290
12     0.010210
30     0.009527
105    0.009383
135    0.009068
98     0.008711
44     0.008590
7      0.007925
72     0.007801
120    0.007505
dtype: float64


In [78]:
import joblib

joblib.dump(tfidf, "tfidf.pkl")
joblib.dump(svd, "svd.pkl")
joblib.dump(svm_clf, "svm_classifier.pkl")
joblib.dump(gbr, "gbr_regressor.pkl")



['gbr_regressor.pkl']

In [79]:
import sklearn, sys
print("Notebook sklearn:", sklearn.__version__)
print("Notebook python:", sys.executable)


Notebook sklearn: 1.7.0
Notebook python: C:\Users\mayan\AppData\Local\Programs\Python\Python313\python.exe
