# PRIMO 2022 ML Homework 4<br>
### Estimating the issue priority in the bug tracker

Задача: предсказать, имеет ли проблема высокий приоритет, учитывая ее описание и метаданные.<br>
Данные: взяты с сайта youtrack.jetbrains.com.<br>
Оценка качества: F1 мера

Метрика оценки качества для соревнования: F1 мера для класса 1.<br>
<br>
**Формат файла с предсказаниями<br>**
Для каждой задачи в наборе test итоговый файл с предсказаниями должен содержать два значения: id (id задачи, взятый из столбца `id` в наборе данных test) и предсказание приоритета задачи (1, если приоритет высокий, или 0, если приоритет низкий).<br>
<br>
Файл должен иметь заголовок и следующий формат:<br>
<br>
id,is_high_priority<br>
0,1<br>
1,1<br>
2,1<br>

In [186]:
import pandas as pd
import numpy as np

import warnings
import json
import operator

from tqdm import tqdm

from sklearn.model_selection import train_test_split


from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import MultinomialNB
#np.set_printoptions(precision=3)
#pd.set_option('precision', 3)



In [187]:
test_data = pd.read_csv('test.csv', index_col='id')

In [188]:
train_data = pd.read_csv('train.csv', index_col='id')


In [189]:
train_data

Unnamed: 0_level_0,summary,description,reporter,created,customFields,links,is_high_priority
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
25-60681,Don't sync font size within IDEA server plugin,At home I have 1900x1200 resolution and at my ...,"{""login"": ""machak"", ""$type"": ""User""}",1231150644000,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False
25-60682,Cancelling subversion update,When cancelling an update from ie. subversion ...,"{""login"": ""sprice"", ""$type"": ""User""}",1231150705000,"[{""value"": {""name"": ""Usability Problem"", ""$typ...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False
25-60686,Suspended Breakpoint in JVM not Recognized in ...,In some breakpoint cases for a app launched fr...,"{""login"": ""brigham"", ""$type"": ""User""}",1231183948000,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False
25-60691,Good code red: IDEA incorrectly resolves neste...,The relevant snippet if part of the UIDebug cl...,"{""login"": ""xduke"", ""$type"": ""User""}",1231241109000,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False
25-60679,Module WIll Not Load Jar Dependencies on Intre...,I have IntelliJ 7.0.5 running on both WIndows ...,"{""login"": ""stonemack"", ""$type"": ""User""}",1231133633000,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False
...,...,...,...,...,...,...,...
25-2507313,Version Control Incoming tab missing search fi...,"The Version Control tab for ""Incoming"" doesn't...","{""login"": ""markhodgson"", ""$type"": ""User""}",1569492791894,"[{""value"": {""name"": ""Usability Problem"", ""$typ...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False
25-2507159,Groovy compiler can't find files with names co...,1. Create a new Project with groovy support\n2...,"{""login"": ""ted.lundqvist"", ""$type"": ""User""}",1569488444807,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False
25-2507096,auto close stepped in files on debug resume,"Hi,\n\none coworker switched from eclipse to I...","{""login"": ""alain57"", ""$type"": ""User""}",1569486640163,"[{""value"": {""name"": ""Feature"", ""$type"": ""EnumB...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False
25-2506900,Keymaps don't resync after disabled settings s...,I had to disable setting sync so I could expor...,"{""login"": ""Dmi3se"", ""$type"": ""User""}",1569452238504,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False


In [190]:
train_data.summary.fillna('', inplace=True)
train_data.description.fillna('', inplace=True)


In [191]:
json_fields = pd.json_normalize(train_data.customFields.map(json.loads).map(lambda x: {field['name']: field['value'] for field in x}))
json_fields.set_index(train_data.index, inplace=True)
json_fields.fillna('',inplace=True)



In [192]:
json_fields['Subsystem.name'].value_counts()

                                     22587
User Interface                        6840
Editor. Editing Text                  4038
Lang. Flash and Flex                  3516
Build. Maven                          3420
                                     ...  
No Subsystem                             1
User Interface. Search Everywhere        1
Core. Licensing                          1
Frameworks. Micronaut                    1
User Interface. Plugins                  1
Name: Subsystem.name, Length: 177, dtype: int64

In [193]:
json_columns = ['Type.name','State.name','Subsystem.name']
train_data = train_data.join(json_fields[json_columns], how='outer', lsuffix='_left')

train_data['reporter_name'] = train_data.reporter.map(json.loads).map(operator.itemgetter('login'))
train_data['Year'] = train_data.created.apply(lambda c: pd.to_datetime(c, unit='ms').year)
train_data['Month'] = train_data.created.apply(lambda c: pd.to_datetime(c, unit='ms').month)
train_data['Day'] = train_data.created.apply(lambda c: pd.to_datetime(c, unit='ms').day)
train_data['Hour'] = train_data.created.apply(lambda c: pd.to_datetime(c, unit='ms').hour)
prep_train_data = train_data.drop(labels=['reporter','created','customFields','links'], axis=1)

In [194]:
prep_train_data

Unnamed: 0_level_0,summary,description,is_high_priority,Type.name,State.name,Subsystem.name,reporter_name,Year,Month,Day,Hour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
25-60681,Don't sync font size within IDEA server plugin,At home I have 1900x1200 resolution and at my ...,False,Bug,Obsolete,Core. Project Settings,machak,2009,1,5,10
25-60682,Cancelling subversion update,When cancelling an update from ie. subversion ...,False,Usability Problem,Obsolete,Version Control. Subversion,sprice,2009,1,5,10
25-60686,Suspended Breakpoint in JVM not Recognized in ...,In some breakpoint cases for a app launched fr...,False,Bug,Duplicate,Java. Debugger,brigham,2009,1,5,19
25-60691,Good code red: IDEA incorrectly resolves neste...,The relevant snippet if part of the UIDebug cl...,False,Bug,Obsolete,Code Analysis. Inspection,xduke,2009,1,6,11
25-60679,Module WIll Not Load Jar Dependencies on Intre...,I have IntelliJ 7.0.5 running on both WIndows ...,False,Bug,Fixed,Core. Project Settings,stonemack,2009,1,5,5
...,...,...,...,...,...,...,...,...,...,...,...
25-2507313,Version Control Incoming tab missing search fi...,"The Version Control tab for ""Incoming"" doesn't...",False,Usability Problem,Submitted,Version Control. Subversion,markhodgson,2019,9,26,10
25-2507159,Groovy compiler can't find files with names co...,1. Create a new Project with groovy support\n2...,False,Bug,Duplicate,,ted.lundqvist,2019,9,26,9
25-2507096,auto close stepped in files on debug resume,"Hi,\n\none coworker switched from eclipse to I...",False,Feature,Duplicate,Java. Debugger,alain57,2019,9,26,8
25-2506900,Keymaps don't resync after disabled settings s...,I had to disable setting sync so I could expor...,False,Bug,Duplicate,Core. IDE Settings. Sharing,Dmi3se,2019,9,25,22


In [195]:
test_data.summary.fillna('', inplace=True)
test_data.description.fillna('', inplace=True)

In [196]:
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize


In [197]:
import re

In [198]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\WildGoose\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\WildGoose\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\WildGoose\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

*сначала происходит лемматизация слов, и потом они отфильтровываются по условия, так работать не должно*

In [233]:
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
       return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if re.fullmatch(r'[A-Za-z]{4,}', t) and t not in STOP_WORDS]

tfidf_vec_summary = TfidfVectorizer(min_df=.01, max_df=0.8,
                                   # token_pattern=r'[A-Za-z]{2,}',
                                    tokenizer=LemmaTokenizer(),
                             )
tfidf_vec_summary.fit(X_train_summary)
X_train_summary_sample = pd.DataFrame(tfidf_vec_summary.transform(X_train_summary).todense(),
                                      columns=tfidf_vec_summary.get_feature_names_out())
X_train_summary_sample



pd.options.display.max_rows = 200

X_train_summary_sample.columns.value_counts()

In [200]:
prep_train_data['Type.name'].value_counts()

Bug                                                   72504
Feature                                               14848
Usability Problem                                     10387
Cosmetics                                              2762
Performance Problem                                    2171
Exception                                              1474
Task                                                    764
Support Request                                         192
Configuration Problem                                   118
Meta Issue                                               89
Auto-reported Exception                                  34
Security Problem                                         21
Security (deprecated use Security Problem instead)        1
Name: Type.name, dtype: int64

tfidf_vec_desc = TfidfVectorizer(min_df=.01, max_df=0.5,
                                   # token_pattern=r'[A-Za-z]{2,}',
                                    tokenizer=LemmaTokenizer(),
                                    stop_words=STOP_WORDS)
tfidf_vec_desc.fit(X_train_description)
X_train_description_sample = pd.DataFrame(tfidf_vec_desc.transform(X_train_description).todense(),
                                          columns=tfidf_vec_desc.get_feature_names_out())
X_train_description_sample

In [201]:
class DenseTransformer(TransformerMixin):

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()

In [202]:
prep_train_data

Unnamed: 0_level_0,summary,description,is_high_priority,Type.name,State.name,Subsystem.name,reporter_name,Year,Month,Day,Hour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
25-60681,Don't sync font size within IDEA server plugin,At home I have 1900x1200 resolution and at my ...,False,Bug,Obsolete,Core. Project Settings,machak,2009,1,5,10
25-60682,Cancelling subversion update,When cancelling an update from ie. subversion ...,False,Usability Problem,Obsolete,Version Control. Subversion,sprice,2009,1,5,10
25-60686,Suspended Breakpoint in JVM not Recognized in ...,In some breakpoint cases for a app launched fr...,False,Bug,Duplicate,Java. Debugger,brigham,2009,1,5,19
25-60691,Good code red: IDEA incorrectly resolves neste...,The relevant snippet if part of the UIDebug cl...,False,Bug,Obsolete,Code Analysis. Inspection,xduke,2009,1,6,11
25-60679,Module WIll Not Load Jar Dependencies on Intre...,I have IntelliJ 7.0.5 running on both WIndows ...,False,Bug,Fixed,Core. Project Settings,stonemack,2009,1,5,5
...,...,...,...,...,...,...,...,...,...,...,...
25-2507313,Version Control Incoming tab missing search fi...,"The Version Control tab for ""Incoming"" doesn't...",False,Usability Problem,Submitted,Version Control. Subversion,markhodgson,2019,9,26,10
25-2507159,Groovy compiler can't find files with names co...,1. Create a new Project with groovy support\n2...,False,Bug,Duplicate,,ted.lundqvist,2019,9,26,9
25-2507096,auto close stepped in files on debug resume,"Hi,\n\none coworker switched from eclipse to I...",False,Feature,Duplicate,Java. Debugger,alain57,2019,9,26,8
25-2506900,Keymaps don't resync after disabled settings s...,I had to disable setting sync so I could expor...,False,Bug,Duplicate,Core. IDE Settings. Sharing,Dmi3se,2019,9,25,22


In [203]:
prep_train_data = prep_train_data.sample(frac = 1)
chunks = np.array_split(prep_train_data, 12)
#y = chunks[0]['is_high_priority']

Unnamed: 0_level_0,summary,description,Type.name,State.name,Subsystem.name,reporter_name,Year,Month,Day,Hour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
25-1305802,"Folded imports region opens when ""Organize imp...",I have my imports folded by default. When I ru...,Bug,Duplicate,Editor. Editing Text,chriskent,2016,9,17,18
25-2446508,[Feature] Using EditorConfig to configure code...,Rider can configure inspections using EditorCo...,Feature,Submitted,Code Analysis. Inspection,oandreyev,2019,7,31,15
25-1411366,Cleanup doesn't work: Can't create commit when...,I am unable to create git commits (Ctrl+K) if ...,Bug,Duplicate,Version Control. Git,C-Otto,2017,1,15,18
25-640628,"SVN: Default trunk location in repository ""Int...",What steps will reproduce the problem?\n1. Giv...,Bug,Duplicate,Version Control. Subversion,Niek_Boonman,2013,12,2,11
25-445455,Settings are lost every time something in mave...,"Hi,\nme and my colleagues having massively pro...",Bug,Fixed,Android,Kaheeson,2012,7,3,12
...,...,...,...,...,...,...,...,...,...,...
25-527242,Cursor jumps to first line when splitting file...,When I split the file view either Vertically o...,Bug,Duplicate,Editor. Editing Text,Imran.Zahid,2013,3,21,23
25-519026,JDK 1.8: CCE at InlineMethodProcessor.canInlin...,'''Build 128.123'''\n\nSample to reproduce:\n\...,Exception,Fixed,Java. Refactorings,vika,2013,3,11,12
25-233132,Build 99.32 is missing for affected versions i...,Please add it to the drop down.,Meta Issue,Fixed,,dittert,2011,1,2,17
25-335545,Commit dialog: use 'check spelling' button ins...,Please check the screenshot to get the idea - ...,Usability Problem,Fixed,Version Control,denis.zhdanov,2011,7,18,12


\# todo сократить размерность. Лемматизация, регулярные выражения поумнее, пороги для векторизации, объединить слова описания и заголовка?
отбор признаков по модели

In [268]:
from sklearn_pandas import DataFrameMapper

preprocessor = DataFrameMapper([('summary', TfidfVectorizer(min_df=.05, max_df=0.5,
                                   # token_pattern=r'[A-Za-z]{2,}',
                                    tokenizer=LemmaTokenizer())),
                               ('description', TfidfVectorizer(min_df=.15, max_df=0.4,
                                    tokenizer=LemmaTokenizer())),
                                (['Type.name'], OneHotEncoder(sparse=False, handle_unknown='ignore')),
                                (['reporter_name'], OneHotEncoder(sparse=False, handle_unknown='ignore')),
                                (['Subsystem.name'], OneHotEncoder(sparse=False, handle_unknown='ignore')),
                                ('Year', None),
                                ('Month', None),
                                ('Day', None),
                                ('Hour', None),
                                ],
                               input_df=True,
                               df_out=True)



In [210]:
X, Y = chunks[0].drop(['is_high_priority'], axis=1), chunks[0]['is_high_priority']


In [269]:
test_data = preprocessor.fit_transform(X)




KeyboardInterrupt: 

In [265]:
test_data


Unnamed: 0_level_0,summary_code,summary_error,summary_file,summary_idea,summary_project,summary_window,description_build,description_change,description_class,description_code,...,Subsystem.name_x0_Version Control. Log,Subsystem.name_x0_Version Control. Mercurial,Subsystem.name_x0_Version Control. Perforce,Subsystem.name_x0_Version Control. StarTeam (archived),Subsystem.name_x0_Version Control. Subversion,Subsystem.name_x0_Version Control. TFS (archived),Year,Month,Day,Hour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25-1054946,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,2015,11,4,16
25-2014436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238547,0.213999,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,2018,6,6,18
25-159962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.284768,0.958596,...,0.0,0.0,0.0,0.0,0.0,0.0,2010,3,9,14
25-511332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,2013,2,18,11
25-341765,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.012817,0.000000,0.019353,...,0.0,0.0,0.0,0.0,0.0,0.0,2011,8,24,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25-465315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,2012,8,27,2
25-487860,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,2012,11,15,13
25-522932,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,2013,3,14,22
25-2286962,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,2019,3,18,13


In [None]:
Y = Y.astype(np.int8)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(, Y,
                                                    stratify=y,
                                                    random_state=3,
                                                    test_size=0.25,
                                                    shuffle=True)

In [None]:
# https://scikit-learn.org/stable/modules/naive_bayes.html#multinomial-naive-bayes
from sklearn.linear_model import LinearRegression, LogisticRegression, BayesianRidge, PassiveAggressiveRegressor, SGDClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB, MultinomialNB, ComplementNB

In [None]:
# todo надо бужет скэйлить!
clf = ComplementNB()

clf.fit(X_train,y_train)

In [None]:


y_train_prediction = clf.predict(X_test)
#print (f'Test f1 = {f1_score(y_true=y_test, y_pred=nb_pipeline.predict(X_test))}')

In [None]:
y_test_pred = clf.predict(X_prep_test)

In [None]:
score = clf.score(X_prep, y_train)
score

In [None]:
print (f'Train f1 = {f1_score(y_true=y_train, y_pred=clf.predict(X_prep))}')

In [None]:
clf.predict(X_test)

In [None]:


nb_pipeline = Pipeline(steps = [('tfidf', preprocessor),
                                ('to_dense', DenseTransformer()),
                                ('clf', ComplementNB())
                               ]
                       )

nb_pipeline.fit(X_train, y_train)

print (f'Train f1 = {f1_score(y_true=y_train, y_pred=nb_pipeline.predict(X_train))}')
print (f'Test f1 = {f1_score(y_true=y_test, y_pred=nb_pipeline.predict(X_test))}')

In [None]:
from sklearn.ensemble import RandomForestRegressor
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.linear_model import LinearRegression, LogisticRegression, BayesianRidge, PassiveAggressiveRegressor, SGDClassifier
param_grid = dict(min_df=[0.01, .05], max_df=[.7, .8])

results = []

for params in tqdm(ParameterGrid(param_grid)):
#     print(params)
    pipe = Pipeline(steps = [('tfidf', TfidfVectorizer(min_df=params['min_df'], max_df=params['max_df'],
                                                       token_pattern=r'[A-Za-z]{2,}',
                                                       stop_words=STOP_WORDS)),
                             ('to_dense', DenseTransformer()),
                             ('clf', ComplementNB())
                            ]
                       )

    #cv_train_preds = cross_val_predict(pipe, X_text_train, y_train, cv = 5, n_jobs = -1)

    pipe.fit(X_train, y_train)
    train_preds = pipe.predict(X_train)

    results.append(dict(
                        estimator=pipe,
                        parameters=params,
                        train_f1 = f1_score(y_true=y_train, y_pred=pipe.predict(X_train)),
                        test_f1 = f1_score(y_true=y_test, y_pred=pipe.predict(X_test))
    ))

In [None]:
pd.DataFrame(results).drop(columns='estimator').sort_values('test_f1').style.bar(vmin=0, vmax=1)

In [None]:
json_fields = pd.json_normalize(train_data.customFields.map(json.loads).map(lambda x: {field['name']: field['value'] for field in x}))
json_fields.set_index(train_data.index, inplace=True)
json_fields.head()

In [None]:
json_fields.info()

In [None]:
json_columns = ['Type.name' , 'Subsystem.name']

In [None]:
train_data = train_data.join(json_fields[json_columns], how='outer', lsuffix='_left')

train_data['reporter_name'] = train_data.reporter.map(json.loads).map(operator.itemgetter('login'))
train_data['Year'] = train_data.created.apply(lambda c: pd.to_datetime(c, unit='ms').year)
train_data['Month'] = train_data.created.apply(lambda c: pd.to_datetime(c, unit='ms').month)
train_data['Day'] = train_data.created.apply(lambda c: pd.to_datetime(c, unit='ms').day)
train_data['Hour'] = train_data.created.apply(lambda c: pd.to_datetime(c, unit='ms').hour)

In [None]:
train_data

In [None]:
columns = ['summary', 'reporter_name', 'Year','Month', 'Day', 'Hour'] +  json_columns
#columns = ['summary'] +  json_columns

X = train_data[columns]
y = train_data['is_high_priority']

In [None]:
X.Year.value_counts()

In [None]:
pipe = make_pipeline(preprocessor, ComplementNB())

pipe.fit(X, y)

In [None]:
probabilities = pipe.predict_proba(X)[:, 1]

In [None]:
probabilities

In [None]:
threshold = 0.6

In [None]:
f1_score(y_true=y, y_pred=probabilities > threshold)