<a href="https://colab.research.google.com/github/SergeiLab/CAPEX-vs-OPEX/blob/main/capexvsopex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import joblib

# Train the CAPEX/OPEX classifier on TF-IDF text features plus three metadata
# columns, then persist the model and every fitted preprocessor for inference.
# NOTE(review): the model is fitted on the full training set; no hold-out
# split or metric is computed in this cell.
train = pd.read_csv('capex_opex_train.csv')

# Missing text becomes an empty string; missing categoricals share the
# explicit 'UNKNOWN' bucket.
train['task_subject'] = train['task_subject'].fillna('')
train['work_description'] = train['work_description'].fillna('')
train['task_type'] = train['task_type'].fillna('UNKNOWN')
train['task_status'] = train['task_status'].fillna('UNKNOWN')

# A single free-text field feeds the vectorizer.
train['text'] = train['task_subject'] + ' ' + train['work_description']

le_type = LabelEncoder()
le_status = LabelEncoder()
le_target = LabelEncoder()

# Always reserve 'UNKNOWN' in the categorical vocabularies, even when the
# training data has no missing values. The inference cell maps unseen labels
# to 'UNKNOWN'; without this reservation it has to append a brand-new code
# after the model is already trained. LabelEncoder sorts and de-duplicates
# its classes, so the relative order of the observed labels is unchanged.
# Assumes both columns hold string labels -- TODO confirm.
le_type.fit(np.append(train['task_type'].unique(), 'UNKNOWN'))
le_status.fit(np.append(train['task_status'].unique(), 'UNKNOWN'))

train['task_type_enc'] = le_type.transform(train['task_type'])
train['task_status_enc'] = le_status.transform(train['task_status'])
train['target_enc'] = le_target.fit_transform(train['target'])

# Unigram + bigram TF-IDF capped at 5000 terms.
# NOTE(review): stop_words='english' only removes English stop words --
# confirm that the task text is English.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
# .toarray() densifies the matrix so it can be stacked with the metadata.
X_text = tfidf.fit_transform(train['text']).toarray()

# NOTE(review): 'hours' is used as-is; NaN values (if any) are not imputed.
X_meta = train[['hours', 'task_type_enc', 'task_status_enc']].values

# Column order (text features first, then hours / type / status) must match
# the feature matrix built in the inference cell.
X = np.hstack([X_text, X_meta])
y = train['target_enc'].values

model = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    random_state=42,
    class_weight='balanced'
)
model.fit(X, y)

# Persist the model together with every fitted preprocessor it depends on.
joblib.dump(model, 'model.pkl')
joblib.dump(tfidf, 'tfidf.pkl')
joblib.dump(le_type, 'le_type.pkl')
joblib.dump(le_status, 'le_status.pkl')
joblib.dump(le_target, 'le_target.pkl')

print("ML is Over.")

✅ Модель обучена и сохранена.


In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import joblib

# Read the public test set that predictions will be produced for.
test = pd.read_csv('capex_opex_public_test.csv')

# Null handling: empty string for the free-text columns, the 'UNKNOWN'
# bucket for the categorical ones.
for text_column in ('task_subject', 'work_description'):
    test[text_column] = test[text_column].fillna('')
for label_column in ('task_type', 'task_status'):
    test[label_column] = test[label_column].fillna('UNKNOWN')

test['text'] = test['task_subject'] + ' ' + test['work_description']

# Restore the fitted vectorizer, encoders and model written by the training
# cell. joblib.load unpickles its input, so only load files you trust.
tfidf, le_type, le_status, le_target, model = (
    joblib.load(artifact_path)
    for artifact_path in (
        'tfidf.pkl',
        'le_type.pkl',
        'le_status.pkl',
        'le_target.pkl',
        'model.pkl',
    )
)

# Make sure both categorical encoders can encode the 'UNKNOWN' fallback;
# when it is missing it is appended as the last class.
for label_encoder in (le_type, le_status):
    if 'UNKNOWN' not in label_encoder.classes_:
        label_encoder.classes_ = np.append(label_encoder.classes_, 'UNKNOWN')

def safe_transform(series, encoder, unknown_label='UNKNOWN'):
    """Encode ``series`` with ``encoder``, routing unseen labels to a fallback.

    Every value that is not in ``encoder.classes_`` is replaced by
    ``unknown_label`` before ``encoder.transform`` is called.

    Parameters
    ----------
    series : pandas.Series
        Labels to encode.
    encoder : object
        Fitted encoder exposing ``classes_`` and ``transform``.
    unknown_label : str, default 'UNKNOWN'
        Replacement for labels the encoder does not know.

    Returns
    -------
    Whatever ``encoder.transform`` returns for the cleaned series.

    Raises
    ------
    ValueError
        If at least one value needs the fallback but ``unknown_label`` is not
        itself one of ``encoder.classes_``.
    """
    known_labels = set(encoder.classes_)
    series_clean = series.map(
        lambda label: label if label in known_labels else unknown_label
    )

    # Fail with a precise message instead of the encoder's generic
    # "unseen labels" error, which would name the fallback as the culprit.
    if unknown_label not in known_labels and series_clean.eq(unknown_label).any():
        raise ValueError(
            f"series contains labels unknown to the encoder, and the fallback "
            f"{unknown_label!r} is not in encoder.classes_ either"
        )

    return encoder.transform(series_clean)

# Encode the categorical columns; labels the encoders do not know are mapped
# to 'UNKNOWN' by safe_transform.
test['task_type_enc'] = safe_transform(test['task_type'], le_type)
test['task_status_enc'] = safe_transform(test['task_status'], le_status)

# Build the feature matrix: dense TF-IDF columns first, then hours and the
# two encoded categoricals.
X_text = tfidf.transform(test['text']).toarray()
X_meta = test[['hours', 'task_type_enc', 'task_status_enc']].to_numpy()
X = np.concatenate((X_text, X_meta), axis=1)

# Predict encoded classes and map them back to the original target labels.
y_pred_enc = model.predict(X)
y_pred = le_target.inverse_transform(y_pred_enc)

# Write a single-column submission file without the index.
submission = pd.Series(y_pred, name='target').to_frame()
submission.to_csv('submission_seed42.csv', index=False)

print("Look at the submission_seed42.csv")

✅ Предсказания успешно сохранены в submission_seed42.csv
