In [1]:
import pandas as pd

In [67]:
from data_preprocessing.DataManage import DataManage
from models.BertCustomizer import BertCustomizer
import matplotlib.pyplot as plt
from train import get_df
import os
import sys
import tqdm

In [3]:
if __name__ == '__main__':
    os.environ["TFHUB_MODEL_LOAD_FORMAT"] = "UNCOMPRESSED"
    df_complete = get_df()

In [71]:

    # RefIssues
    df_test = df_complete[df_complete["RefIssue"].notnull()]["RefIssue"].rename("IssueId").to_frame()
    df_test = df_test.reset_index()
    df_test = df_test.drop("index", axis=1)
    df_test_2 = df_complete[df_complete["IssueId"].astype(str).isin(df_test["IssueId"].values)][["IssueId", "AssigneeLogin"]]
    df_test_2 = df_test_2.rename(columns={"AssigneeLogin": "RefAssignee"})
    df_final = df_complete.merge(df_test_2, on="IssueId", how="left")

    # Workload

In [68]:
    # For each assigneeLogin get all status opened issues
    print("WORKLOAD CALCULATOR")
    for index in tqdm.tqdm(range(df_complete.shape[0])):
        if index == 0:
            df_workload_complete = []
        else:
            actual_date = df_complete.iloc[index]["CreatedAt"]
            df_selected_issues = df_complete.iloc[:index]
            df_workload = {}
            for assignee in list(set(df_complete["AssigneeLogin"])):
                condition1 = df_selected_issues["ClosedAt"] >= actual_date
                condition2 = df_selected_issues["AssigneeLogin"] == assignee
                a = str(assignee) + "_Wl"
                df_workload[a] = df_selected_issues[condition1 & condition2].shape[0]
            df_workload_complete.append(df_workload)
    print(pd.DataFrame.from_dict(df_workload_complete))





WORKLOAD CALCULATOR


100%|██████████| 14263/14263 [16:56<00:00, 14.04it/s]


       alexr00_Wl  miguelsolorio_Wl  DonJayamanne_Wl  connor4312_Wl  \
0               0                 0                0              0   
1               0                 0                0              0   
2               0                 0                0              0   
3               0                 0                0              0   
4               0                 0                0              0   
...           ...               ...              ...            ...   
14257          24                 2                0             10   
14258          24                 2                0             10   
14259          24                 2                0             10   
14260          24                 2                0             10   
14261          24                 2                0             10   

       bamurtaugh_Wl  roblourens_Wl  dbaeumer_Wl  int19h_Wl  daviddossett_Wl  \
0                  0              0            0          0        

In [69]:
    workload = pd.DataFrame.from_dict(df_workload_complete)
    workload.to_csv("./db/workload.csv")

In [82]:
    workload_cols_name = list(workload.columns)
    df_final2 = pd.concat([df_final,workload], axis=1)

    selected_variables = ['CleanText', 'AssigneeLogin', 'RefAssignee', 'CreatedAt'] + workload_cols_name
    df_final2["CreatedAt"] = pd.to_numeric(pd.to_datetime(df_final2["CreatedAt"]))
    df = df_final2[selected_variables]

                                               CleanText   AssigneeLogin  \
0      [typescript kernel tslab] VSCode's intellisens...           mjbvz   
1      Webivew input events cannot focus datalist ite...      deepak1556   
2      Multiple terminal windows restore correct sess...      meganrogge   
3      Enhance workspacePlatform context key web case...    JacksonKearl   
4      Remove legacy welcome page code This iteration...    JacksonKearl   
...                                                  ...             ...   
14258  Screen reader read quick pick title field set ...  TylerLeonhardt   
14259  Add aria label check checkbox <!-- Thank submi...  TylerLeonhardt   
14260  Extension issue - Issue Type: `Bug`\r - Extens...        sandy081   
14261  Terminal cut letters. ADD ISSUE DESCRIPTION HE...      meganrogge   
14262  update distro Updating distro take latest chan...         isidorn   

      RefAssignee            CreatedAt  alexr00_Wl  miguelsolorio_Wl  \
0             N

In [None]:

    # Data Ingestion
    data_manage = DataManage(
        data_frame=df,
        text_col_name="CleanText",
        label_col_name="AssigneeLogin",
        random_split_method=False,
        test_percentage=20,
        val_percentage=20
    )

    # On Entire DataFrame
    data_manage.split(type_check=False, show_info=True)
    data_manage.transform()

    # On Textual Variable
    data_manage.remove_special_characters()
    print("\n\n\nAFTER SPECIAL")
    print(data_manage.X_train_text_pp)
    data_manage.remove_stopwords()
    print("\n\n\nAFTER STOPWORDS")
    print(data_manage.X_train_text_pp)
    data_manage.lemmatization()
    print("\n\n\nAFTER LEMMATIZATION")
    print(data_manage.X_train_text_pp)
    data_manage.to_lowercase()
    print("\n\n\nAFTER LOWERCASE")
    print(data_manage.X_train_text_pp)

    # Model
    bert_customizer = BertCustomizer(bert_model_name="bert_en_uncased_L-12_H-768_A-12",
                                     info=data_manage.info,
                                     show_info=True)
    print("\n\n\ndata_manage.info")
    print(data_manage.info)
    print("\n\n\ndata_manage.info['additional_input_count']")
    print(data_manage.info["additional_input_count"])
    bert_customizer.build(bert_trainability=True)
    bert_customizer.compile()
    train_generator = bert_customizer.get_tf_dataset_from_generator(
        text=data_manage.X_train_text_pp.apply(lambda x: ' '.join(x)),
        X=data_manage.X_train_pp,
        y=data_manage.y_train_pp)
    val_generator = bert_customizer.get_tf_dataset_from_generator(
        text=data_manage.X_val_text_pp.apply(lambda x: ' '.join(x)),
        X=data_manage.X_val_pp,
        y=data_manage.y_val_pp)
    history = bert_customizer.model.fit(train_generator,
                                        validation_data=val_generator,
                                        epochs=20)
    history_dict = history.history
    print(history_dict.keys())
    acc = history_dict['categorical_accuracy']
    val_acc = history_dict['val_categorical_accuracy']
    loss = history_dict['loss']
    val_loss = history_dict['val_loss']
    epochs = range(1, len(acc) + 1)
    fig = plt.figure(figsize=(10, 6))
    fig.tight_layout()
    plt.subplot(2, 1, 1)
    # r is for "solid red line"
    plt.plot(epochs, loss, 'r', label='Training loss')
    # b is for "solid blue line"
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    # plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.subplot(2, 1, 2)
    plt.plot(epochs, acc, 'r', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.savefig("./models/bert_history.png")