In [174]:
import pandas as pd
import bs4
import re

from tqdm.notebook import tqdm
tqdm.pandas()
DEBUG = False

### Getting the files
We have the raw files from the database.
These are extracted in **Microsoft SQL Manager Studio 18**, by calling a `SELECT * FROM TABLE`.
Then, on the result-set right-click and press `save result as`.

In [175]:
PATH_REQUESTS = 'data/database/Requests.csv'

df_requests = pd.read_csv(PATH_REQUESTS,
                 encoding='UTF-8',
                 delimiter=';',
                 quotechar='"',
                 dtype=str,
                 usecols=['id'])

df_requests = df_requests.fillna('')
if DEBUG:
    df_requests = df_requests[df_requests['id'] == '109842828']

In [176]:
PATH_RELATIONS = 'data/database/RelationHistory.csv'

df_relations = pd.read_csv(PATH_RELATIONS,
                 encoding='UTF-8',
                 delimiter=';',
                 quotechar='"',
                 dtype=str,
                 usecols=['tblid', 'leftID', 'rightID'])

if DEBUG:
    df_relations = df_relations[df_relations['leftID'] == '109842828']

df_relations = df_relations.fillna('')
df_relations = df_relations.sort_values(by=['tblid'])
df_relations = df_relations.drop_duplicates(subset=['tblid'], keep='last')
df_relations = df_relations.rename(columns={'leftID': 'leftId', 'rightID': 'rightId'})

In [177]:
df_r = pd.merge(df_requests, df_relations, left_on='id', right_on='leftId', how='left')
df_r = df_r.drop(columns=['id'])

In [178]:
def name_changed(x):
    if x['name'] == 'RequestIncidentResponsible':
        return 'RequestServiceResponsible'
    if x['name'] == 'RequestIncidentReceivedBy':
        return 'RequestServiceReceivedBy'
    if x['name'] == 'RequestIncidentUser':
        return 'RequestServiceUser'
    return x['name']

In [179]:
PATH_OBJECTS = 'data/database/ObjectHistory.csv'

df_objects = pd.read_csv(PATH_OBJECTS,
                 encoding='UTF-8',
                 delimiter=';',
                 quotechar='"',
                 dtype=str,
                 usecols=['tblid', 'name'])

df_objects = df_objects.fillna('')
df_objects = df_objects[df_objects['name'].isin([
    'RequestServiceResponsible',
    'RequestServiceReceivedBy',
    'RequestServiceUser',
    'RequestIncidentResponsible',
    'RequestIncidentReceivedBy',
    'RequestIncidentUser',
])]

df_objects['name_changed'] = df_objects.apply(lambda x: name_changed(x), axis=1)
df_objects = df_objects.drop(columns=['name'])

df_objects = df_objects.sort_values(by=['tblid'])
df_objects = df_objects.rename(columns={'tblid': 'objectTblid'})

In [180]:
df_o = pd.merge(df_r, df_objects, left_on='tblid', right_on='objectTblid', how='inner')
df_o = df_o.drop_duplicates(subset=['leftId', 'name_changed'], keep='last')
df_o = df_o.drop(columns=['tblid', 'objectTblid'])

In [181]:
PATH_ITEMS = 'data/database/Items.csv'

df_items = pd.read_csv(PATH_ITEMS,
                 encoding='UTF-8',
                 delimiter=';',
                 quotechar='"',
                 dtype=str,
                 usecols=['id', 'username'])

df_items = df_items.fillna('')
df_items = df_items[df_items['username'] != '']
df_items = df_items.rename(columns={'id': 'itemId'})

In [182]:
df_i = pd.merge(df_o, df_items, left_on='rightId', right_on='itemId', how='inner')
df_i = df_i.drop(columns=['rightId', 'itemId'])

In [183]:
df_i = df_i.sort_values(by='name_changed')


In [184]:
cc = df_i.groupby(['leftId']).cumcount() + 1
df = df_i.set_index(['leftId', cc]).unstack().sort_index(1, level=1)
df.columns = ['_'.join(map(str,i)) for i in df.columns]
df.reset_index()
df = df.fillna('unknown')

  df = df_i.set_index(['leftId', cc]).unstack().sort_index(1, level=1)


In [185]:
def get_role(x, role):
    if x['name_changed_1'][-len(role):] == role:
        return x['username_1'].lower()
    if x['name_changed_2'][-len(role):] == role:
        return x['username_2'].lower()
    if x['name_changed_3'][-len(role):] == role:
        return x['username_3'].lower()
    return 'unknown'

def get_closed(x):
    if x['label_responsible'] != 'unknown':
        return x['label_responsible']
    return x['label_received']

df['label_user'] = df.apply(lambda x: get_role(x, 'User'), axis=1)
df['label_received'] = df.apply(lambda x: get_role(x, 'ReceivedBy'), axis=1)
df['label_responsible'] = df.apply(lambda x: get_role(x, 'Responsible'), axis=1)
df['label_closed'] = df.apply(lambda x: get_closed(x), axis=1)

In [186]:
df[['label_user', 'label_received', 'label_responsible', 'label_closed']].to_csv('data/label_users.csv')

df = pd.read_csv('data/label_users.csv')
df = df.rename(columns={'leftId': 'id'})

df.to_csv('data/label_users.csv', index=False)

In [190]:
from sklearn import preprocessing

def get_labels(labels):
    le = preprocessing.LabelEncoder()
    le.fit(labels)
    return le

df = pd.read_csv('data/label_users.csv')
df = df[df['label_closed'] != 'unknown']

top_list = df['label_closed'].value_counts().index.tolist()
tmp = df[df['label_closed'].isin(top_list[:100])]
label_encoder = get_labels(tmp['label_closed'].to_numpy())
tmp['label_encoded'] = label_encoder.transform(tmp['label_closed'])
tmp[['id', 'label_closed', 'label_encoded']].to_csv('data/label_users_top_100.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['label_encoded'] = label_encoder.transform(tmp['label_closed'])
