In [1]:
import json

import torch
import numpy  as np
import pandas as pd

from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from src.commentsProcessing import CommentsPreparation
from src.employeesProcessing import EmployeesPreparation
from src.baseProcessing import createTimeFeatures, createAuxiliaryFeatures, inverseBoxCox, Embeddings, frequencyTables,\
diffFeaturesByGroups, diffFeatures, posFreqEstimation, coeffPositionsInProjects, validateModel, lemmatizeSummary

In [2]:
with open("links.json", "r") as file:
    links = json.load(file)
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[device]: {device}")    
    
del file

[device]: cuda


## GENERAL

In [3]:
dfEmp = EmployeesPreparation(links["auxiliary"]["employees"], naLimit=0.5).apply()
print(f"[dfEmp] : {dfEmp.shape}")

['salary_calculation_type', 'english_level', 'full_name', 'passport'] were dropped
[dfEmp] : (343, 4)


## TRAIN

In [4]:
byAuthor, byIssue, _ = CommentsPreparation(links["train"]["comments"]).apply()
print(f"[byAuthor] : {byAuthor.shape} \n[byIssue]: {byIssue.shape}")

[byAuthor] : (52, 9) 
[byIssue]: (3945, 2)


In [5]:
dfTrainIssues = pd.read_csv(links["train"]["issues"])
dfTrainIssues["summary"] = dfTrainIssues["summary"].str.lower()
dfTrainIssues["overall_worklogs"] = np.log1p(dfTrainIssues["overall_worklogs"])
print(f"[dfTrainIssues]: {dfTrainIssues.shape}")

[dfTrainIssues]: (9589, 8)


In [6]:
dfTrainIssues = lemmatizeSummary(dfTrainIssues)
dfTrainIssues = dfTrainIssues.merge(byAuthor, on="assignee_id", how="left").fillna(0)
dfTrainIssues = dfTrainIssues.merge(byIssue, on="id", how="left").fillna(0)
dfTrainIssues = dfTrainIssues.merge(dfEmp.rename(columns={"id": "assignee_id"}), on="assignee_id", how="left")
dfTrainIssues = coeffPositionsInProjects(dfTrainIssues)
dfTrainIssues = posFreqEstimation(dfTrainIssues)
dfTrainIssues = createTimeFeatures(dfTrainIssues)
dfTrainIssues = frequencyTables(dfTrainIssues)
dfTrainIssues = diffFeaturesByGroups(dfTrainIssues)
dfTrainIssues = diffFeatures(dfTrainIssues)
dfTrainIssues = createAuxiliaryFeatures(dfTrainIssues)
print(dfTrainIssues.shape)

del byAuthor, byIssue

9589it [06:20, 25.17it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:27<00:00,  3.87s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:13<00:00,  1.97s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:08<00:00,  1.18s/it]

(9589, 58)





In [7]:
objectCols = [col for col in list(dfTrainIssues) if dfTrainIssues[col].dtype == "object"]
objectCols.remove("summary_l")
objectCols.append("created")
print(objectCols)

dfTrainIssues.drop(objectCols, axis=1, inplace=True)
print(dfTrainIssues.shape)

del objectCols

['summary', 'pos', 'time', 'created']
(9589, 54)


In [8]:
dfTestIssues = pd.read_csv(links["test"]["issues"])
dfTestIssues["summary"] = dfTestIssues["summary"].str.lower()
dfTestIssues = lemmatizeSummary(dfTestIssues)

vec = pd.concat([dfTrainIssues["summary_l"].to_frame(), dfTestIssues["summary_l"].to_frame()]).reset_index(drop=True)
print(vec.shape)

del dfTestIssues

1070it [00:12, 88.10it/s] 

(10659, 1)





In [9]:
vectorizer = TfidfVectorizer(lowercase=False, ngram_range=(1,3), min_df=0.001, max_df=1.0, dtype="float32", binary=False,
                            stop_words={"is", "by", "for", "from", "in", "the", "to", "of", "on", "do", "does", "has",
                                        "to", "на", "по", "для", "with", "and", "into", "have", "our", "was", "we",
                                        "my", "an", "about", "are", "as", "at", "be", "can" "you", "your", "за", "из",
                                        "от", "со", "a", "or", "don't", "в", "и"})
tfidf = vectorizer.fit_transform(vec["summary_l"])
tfidfDf = pd.DataFrame(columns=vectorizer.get_feature_names(), data=tfidf.toarray())
tfidfDf.drop([col for col in list(tfidfDf) if col[0].isdigit()], axis=1, inplace=True)
tfidfDf["id"] = list(range(10659))
tfidfDf.shape

(10659, 1073)

In [10]:
dfTrainIssues = dfTrainIssues.merge(tfidfDf[:9589], on="id", how="left").fillna(0)
print(dfTrainIssues.shape)
print(dfTrainIssues.isna().sum().sum())

dfTrainIssues.drop([col for col in list(dfTrainIssues) if dfTrainIssues[col].dtype == "object"], axis=1, inplace=True)
dfTrainIssues.drop(["id"], axis=1, inplace=True)

(9589, 1126)
0


In [11]:
# LANG MODELS

dfTrainLang = pd.read_csv(links["train"]["issues"])
dfTrainLang["summary"] = dfTrainLang["summary"].str.lower()
dfTrainLang["overall_worklogs"] = np.log1p(dfTrainLang["overall_worklogs"])

langModels = ["cointegrated/rubert-tiny2", "cointegrated/LaBSE-en-ru"]

for model in langModels:
    dfTrainLang = dfTrainLang.join(Embeddings(dfTrainLang, model, "summary", device).apply())
    
dfTrainLang.drop(["created", "key", "summary", "id"], axis=1, inplace=True)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████

<h1>MODEL VALIDATION</h1>

In [13]:
model = CatBoostRegressor(iterations=1000, random_seed=42, task_type="CPU",
                          eval_metric="R2", use_best_model=False, cat_features=["project_id", "assignee_id", "creator_id"])
validateModel(model, dfTrainIssues, "overall_worklogs", fitParams={"early_stopping_rounds": 100, "verbose": 0},
              nFold=5, KFold_randomState=42)

[X]: (9589, 1122) 
[y]: (9589,)
[0]: 0.24052749712426935
[1]: 0.24485025676406413
[2]: 0.25844370309613995
[3]: 0.25834471026050554
[4]: 0.2426581043226791
[mean]: 0.2489648543135316


In [14]:
# LANG MODELS
model = CatBoostRegressor(iterations=1000, random_seed=42, task_type="CPU",
                          eval_metric="R2", use_best_model=False, cat_features=["project_id", "assignee_id", "creator_id"])
validateModel(model, dfTrainLang, "overall_worklogs", fitParams={"early_stopping_rounds": 100, "verbose": 0},
              nFold=5, KFold_randomState=42)

[X]: (9589, 1083) 
[y]: (9589,)
[0]: 0.23732914708753583
[1]: 0.22623000047222286
[2]: 0.23400867015214166
[3]: 0.20958566475528617
[4]: 0.2565743442720856
[mean]: 0.23274556534785443


## TEST

In [12]:
byAuthorTest, byIssueTest, _ = CommentsPreparation(links["test"]["comments"]).apply()
print(f"[byAuthor] : {byAuthorTest.shape} \n[byIssue]: {byIssueTest.shape}")

[byAuthor] : (27, 9) 
[byIssue]: (479, 2)


In [13]:
dfTestIssues = pd.read_csv(links["test"]["issues"])
dfTestIssues["summary"] = dfTestIssues["summary"].str.lower()
print(f"[dfTestIssues]: {dfTestIssues.shape}")

dfId = dfTestIssues["id"].to_frame()

[dfTestIssues]: (1070, 7)


In [14]:
dfTestIssues = lemmatizeSummary(dfTestIssues)
dfTestIssues = dfTestIssues.merge(byAuthorTest, on="assignee_id", how="left").fillna(0)
dfTestIssues = dfTestIssues.merge(byIssueTest, on="id", how="left").fillna(0)
dfTestIssues = dfTestIssues.merge(dfEmp.rename(columns={"id": "assignee_id"}), on="assignee_id", how="left")
dfTestIssues = coeffPositionsInProjects(dfTestIssues)
dfTestIssues = posFreqEstimation(dfTestIssues)
dfTestIssues = createTimeFeatures(dfTestIssues)
dfTestIssues = frequencyTables(dfTestIssues)
dfTestIssues = diffFeaturesByGroups(dfTestIssues)
dfTestIssues = diffFeatures(dfTestIssues)
dfTestIssues = createAuxiliaryFeatures(dfTestIssues)
print(dfTestIssues.shape)

del byAuthorTest, byIssueTest

1070it [00:11, 93.98it/s] 
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00,  4.66it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00,  4.64it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00,  6.65it/s]

(1070, 57)





In [15]:
objectColsTest = [col for col in list(dfTestIssues) if dfTestIssues[col].dtype == "object"]
objectColsTest.remove("summary_l")
objectColsTest.append("created")
print(objectColsTest)

dfTestIssues.drop(objectColsTest, axis=1, inplace=True)
print(dfTestIssues.shape)

del objectColsTest

['summary', 'pos', 'time', 'created']
(1070, 53)


In [16]:
dfTestIssues = dfTestIssues.merge(tfidfDf[9589:], on="id", how="left").fillna(0)
print(dfTestIssues.shape)
print(dfTestIssues.isna().sum().sum())

dfTestIssues.drop([col for col in list(dfTestIssues) if dfTestIssues[col].dtype == "object"], axis=1, inplace=True)
dfTestIssues.drop(["id"], axis=1, inplace=True)

(1070, 1125)
0


In [17]:
# LANG MODELS

dfTestLang = pd.read_csv(links["test"]["issues"])
dfTestLang["summary"] = dfTestLang["summary"].str.lower()

langModels = ["cointegrated/rubert-tiny2", "cointegrated/LaBSE-en-ru"]

for model in langModels:
    dfTestLang = dfTestLang.join(Embeddings(dfTestLang, model, "summary", device).apply())

dfTestLang.drop(["created", "key", "id"], axis=1, inplace=True)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████

## SUBMIT GENERATING

In [18]:
train, valid = train_test_split(dfTrainIssues, test_size=0.25, shuffle=True, random_state=42)
y_train, y_valid = train["overall_worklogs"], valid["overall_worklogs"]
train.drop(["overall_worklogs"], axis=1, inplace=True)
valid.drop(["overall_worklogs"], axis=1, inplace=True)
allCols = list(train)

dfTestIssues = dfTestIssues[allCols]
print(dfTestIssues.shape)

for _ in range(7):
    randomState = np.random.choice(list(range(1000)))
    
    cb = CatBoostRegressor(iterations=1000, random_seed=randomState, task_type="CPU",
                                   eval_metric="R2", use_best_model=True, cat_features=["project_id", "assignee_id",
                                                                                        "creator_id"])
    cb.fit(train, y_train, eval_set=[(valid, y_valid)], early_stopping_rounds=100, verbose=0)
    
    r2 = r2_score(y_valid, cb.predict(valid))
    print(f"[({randomState}) r2]: {r2}")
    
    dfId[f"pred_rs_{randomState}"] = np.exp(cb.predict(dfTestIssues)) + 1

(1070, 1123)
[(766) r2]: 0.2399940768259865
[(979) r2]: 0.2422871073023063
[(269) r2]: 0.24328626341855686
[(970) r2]: 0.23751384108076357
[(997) r2]: 0.24606253671141665
[(57) r2]: 0.2423468987735531
[(785) r2]: 0.24510828130004303


In [19]:
RsColumns = dfId.columns[1:]
dfId["overall_worklogs"] = dfId[RsColumns].mean(axis=1)
dfId = dfId[["id", "overall_worklogs"]]
dfId.head(5)

Unnamed: 0,id,overall_worklogs
0,675975,18086.276912
1,675972,14303.807374
2,675965,7114.926681
3,675961,15645.837888
4,675955,9209.793517


In [20]:
# LANG MODELS
dfIdL = dfId.copy()

trainL, validL = train_test_split(dfTrainLang, test_size=0.25, shuffle=True, random_state=42)
y_trainL, y_validL = trainL["overall_worklogs"], validL["overall_worklogs"]
trainL.drop(["overall_worklogs"], axis=1, inplace=True)
validL.drop(["overall_worklogs"], axis=1, inplace=True)
allColsL = list(trainL)
len(allColsL)

dfTestLang = dfTestLang[allColsL]
print(dfTestLang.shape)

for _ in range(7):
    randomState = np.random.choice(list(range(1000)))
    
    cb = CatBoostRegressor(iterations=1000, random_seed=randomState, task_type="CPU",
                                   eval_metric="R2", use_best_model=True, cat_features=["project_id", "assignee_id",
                                                                                        "creator_id"])
    cb.fit(trainL, y_trainL, eval_set=[(validL, y_validL)], early_stopping_rounds=100, verbose=0)
    
    r2L = r2_score(y_validL, cb.predict(validL))
    print(f"[({randomState}) r2]: {r2L}")
    
    dfIdL[f"pred_rs_{randomState}"] = np.exp(cb.predict(dfTestLang)) + 1

(1070, 1083)
[(138) r2]: 0.23121337032741762
[(173) r2]: 0.22160960006155717
[(902) r2]: 0.22633459430621827
[(207) r2]: 0.21121634354664076
[(801) r2]: 0.23296610361865955
[(326) r2]: 0.2378746287345398
[(743) r2]: 0.23583660637297543


In [21]:
RsColumnsL = dfIdL.columns[1:]
dfIdL["overall_worklogs"] = dfIdL[RsColumnsL].mean(axis=1)
dfIdL = dfIdL[["id", "overall_worklogs"]]
dfIdL.head(5)

Unnamed: 0,id,overall_worklogs
0,675975,12751.547923
1,675972,7357.738941
2,675965,8856.748984
3,675961,15916.32255
4,675955,10427.003697


In [22]:
finDf = dfId.merge(dfIdL, on="id")
finDf["overall_worklogs"] = finDf[["overall_worklogs_x", "overall_worklogs_y"]].mean(axis=1)
finDf["overall_worklogs"] = finDf["overall_worklogs"] * 1.55
finDf[["id", "overall_worklogs"]].to_csv("finSol_f0.csv", index=False)

----