<a href="https://colab.research.google.com/github/SergeiLab/CAPEX-vs-OPEX/blob/main/capexvsopex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import xgboost as xgb
from scipy.sparse import hstack, csr_matrix
import warnings
warnings.filterwarnings('ignore')

# Single seed value: used to seed NumPy's global RNG here and passed as
# random_state to the train/validation split and the XGBoost model below.
SEED = 42
np.random.seed(SEED)

# Training table. Later code reads the columns task_subject, work_description,
# task_type, task_status, hours and target from it.
df = pd.read_csv('capex_opex_train.csv')

def clean(text):
    """Normalize a raw text cell into lowercase, space-separated tokens.

    Missing values (anything ``pd.isna`` reports as NA) become an empty
    string. Otherwise the value is stringified and lowercased, every character
    that is not a Latin letter, a Cyrillic letter in the range а-я, a digit or
    whitespace is replaced by a space, and runs of whitespace are collapsed.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    # Punctuation and symbols become separators rather than being deleted,
    # so "a-b" yields two tokens instead of one fused token.
    separated = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', separated)
    return collapsed.strip()

# One normalized text field per row: cleaned subject followed by cleaned description.
df['text'] = df['task_subject'].apply(clean) + ' ' + df['work_description'].apply(clean)

# Categorical columns are mapped to integer codes; missing values get their
# own 'unk' category. The encoders are saved so inference can reuse them.
le_type = LabelEncoder()
le_status = LabelEncoder()
le_target = LabelEncoder()

df['type_enc'] = le_type.fit_transform(df['task_type'].fillna('unk'))
df['status_enc'] = le_status.fit_transform(df['task_status'].fillna('unk'))
# log1p compresses the scale of the hours column; missing hours count as 0.
df['hours_log'] = np.log1p(df['hours'].fillna(0))

X_add = df[['type_enc', 'status_enc', 'hours_log']].values
y = le_target.fit_transform(df['target'])

# Hyper-parameters shared by the hold-out run and the final fit, so the
# validation score describes the same configuration that gets saved.
TFIDF_PARAMS = dict(max_features=3000, min_df=2, max_df=0.95, ngram_range=(1, 3))
XGB_PARAMS = dict(n_estimators=200, max_depth=6, learning_rate=0.05, random_state=SEED)

# --- Hold-out evaluation -----------------------------------------------------
# Split row positions first and fit the vectorizer on the training rows only.
# Fitting TF-IDF on all rows before splitting would let validation documents
# shape the vocabulary and IDF weights, which biases the reported score.
idx_tr, idx_val, y_tr, y_val = train_test_split(
    np.arange(len(df)), y, test_size=0.2, random_state=SEED, stratify=y
)

val_tfidf = TfidfVectorizer(**TFIDF_PARAMS)
X_tr = hstack(
    [val_tfidf.fit_transform(df['text'].iloc[idx_tr]), csr_matrix(X_add[idx_tr])],
    format='csr',
)
X_val = hstack(
    [val_tfidf.transform(df['text'].iloc[idx_val]), csr_matrix(X_add[idx_val])],
    format='csr',
)

# Throw-away model used only to compute the validation metric.
val_model = xgb.XGBClassifier(**XGB_PARAMS)
val_model.fit(X_tr, y_tr)

print(f"F1: {f1_score(y_val, val_model.predict(X_val), average='macro'):.4f}")

# --- Final model on all rows -------------------------------------------------
# The saved vectorizer and model are fitted on the full training table.
tfidf = TfidfVectorizer(**TFIDF_PARAMS)
X_text = tfidf.fit_transform(df['text'])
X = hstack([X_text, csr_matrix(X_add)], format='csr')

model = xgb.XGBClassifier(**XGB_PARAMS)
model.fit(X, y)

# Persist everything inference needs to rebuild the same feature layout.
with open('capex_opex_model.pkl', 'wb') as f:
    pickle.dump({'model': model, 'tfidf': tfidf, 'le_type': le_type, 'le_status': le_status, 'le_target': le_target, 'seed': SEED}, f)

print("Done")

In [None]:
import pandas as pd
import numpy as np
import re
import pickle
from scipy.sparse import hstack, csr_matrix
import warnings
warnings.filterwarnings('ignore')

# Restore the artifacts bundle written by the training step.
# NOTE: unpickling can execute arbitrary code, so only load a
# capex_opex_model.pkl that you produced yourself.
with open('capex_opex_model.pkl', 'rb') as f:
    m = pickle.load(f)

# Unpack the bundle into the names used by the rest of this cell.
model = m['model']
tfidf = m['tfidf']
le_type = m['le_type']
le_status = m['le_status']
le_target = m['le_target']
SEED = m['seed']

# Rows to score.
df = pd.read_csv('capex_opex_public_test.csv')

def clean(text):
    """Return a lowercase, whitespace-normalized version of a text cell.

    NA values (per ``pd.isna``) map to ``""``. For anything else the string
    form is lowercased; characters other than Latin letters, Cyrillic letters
    in the range а-я, digits and whitespace are turned into spaces; whitespace
    runs are squeezed to a single space and the ends are trimmed. This mirrors
    the cleaning applied to the training text, which the fitted vectorizer
    expects.
    """
    if pd.isna(text):
        return ""

    value = str(text).lower()
    value = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', ' ', value)
    value = re.sub(r'\s+', ' ', value)
    return value.strip()

# Build the combined text field: cleaned subject, a space, cleaned description.
subject_text = df['task_subject'].apply(clean)
description_text = df['work_description'].apply(clean)
df['text'] = subject_text + ' ' + description_text

# Vectorize with the already-fitted TF-IDF (transform only, no refit).
X_text = tfidf.transform(df['text'])

def enc(le, vals):
    """Encode a categorical column with an already-fitted label encoder.

    Missing values are replaced by the string ``'unk'`` before encoding.
    Labels that do not appear in ``le.classes_`` fall back to code 0.

    Args:
        le: Fitted encoder exposing ``classes_``; a label's position in
            ``classes_`` is its integer code.
        vals: pandas Series of raw category values.

    Returns:
        1-D int64 NumPy array holding one code per element of ``vals``.
    """
    # Build the label -> code table once instead of calling le.transform and
    # scanning le.classes_ for every single row.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    filled = vals.fillna('unk')
    # NOTE(review): unseen labels share code 0 with the first known class;
    # kept as-is so predictions match the existing behaviour.
    return np.array([code_of.get(v, 0) for v in filled], dtype=np.int64)

# Rebuild the three non-text features the same way the training cell did,
# reusing the encoders restored from the pickle.
df['type_enc'] = enc(le_type, df['task_type'])
df['status_enc'] = enc(le_status, df['task_status'])
df['hours_log'] = np.log1p(df['hours'].fillna(0))

# Column order matches training: TF-IDF columns first, then the extras.
X_add = csr_matrix(df[['type_enc', 'status_enc', 'hours_log']].values)
X = hstack([X_text, X_add])

# Predict integer class codes, then map them back to the original labels.
pred_codes = model.predict(X)
preds = le_target.inverse_transform(pred_codes)

# Single-column submission file, named after the seed stored with the model.
submission = pd.DataFrame({'target': preds})
submission.to_csv(f'sample_submission_seed{SEED}.csv', index=False)

print("Done")