In [2]:
#!pip install spacy-langdetect

Collecting spacy-langdetect
  Downloading spacy_langdetect-0.1.2-py3-none-any.whl (5.0 kB)
Collecting langdetect==1.0.7
  Downloading langdetect-1.0.7.zip (998 kB)
     -------------------------------------- 998.1/998.1 kB 3.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.7-py3-none-any.whl size=993439 sha256=3cc6d52711d958e89c4c8aff5fa1ab9534bd210a987bcc73954d29297a8d83f1
  Stored in directory: c:\users\wei.liu\appdata\local\pip\cache\wheels\87\8c\9a\41c0647bd03b3e11ca6968d3638a4e6e764220adf2886270cb
Successfully built langdetect
Installing collected packages: langdetect, spacy-langdetect
Successfully installed langdetect-1.0.7 spacy-langdetect-0.1.2


In [3]:
#!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
     -------------------------------------- 587.7/587.7 MB 1.7 MB/s eta 0:00:00
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.5.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [1]:
import pandas as pd
import numpy as np
import re
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from hdbscan import HDBSCAN
import hdbscan
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import v_measure_score
from umap import UMAP

import time
from unidecode import unidecode


In [2]:
# load processed data from clean folder
df_app = pd.read_csv('data/clean/clean_uncat_app_pst.csv')
#df_csl = pd.read_csv('data/clean/clean_uncat_csl_pst.csv')
df_labeled_2223 = pd.read_csv('data/clean/cleaner_labeled_data_2223.csv')


## Step 2.3 Shuffle Data

In [3]:
# shuffle first because some requests may be seasonal, such as drainage, shuffle to avoid the data being time biased
df_apps = df_app.sample(frac=1, random_state=123, ignore_index=True)
#df_csls = df_csl.sample(frac=1, random_state=123, ignore_index=True)

## Step 3. Train-Test split

In [4]:
# split labeled data into train, valid, and test sets
# train set merge with other_issues to perform semi-supervised learning
# valid set for tuning hyper-parameters, test set for testing

# shuffle 
df_labeled_2223s = df_labeled_2223.sample(frac=1, random_state=123, ignore_index=True)
train_set = df_labeled_2223s.iloc[:int(0.7*df_labeled_2223s.shape[0]),:]
val_set = df_labeled_2223s.iloc[int(0.7*df_labeled_2223s.shape[0]):int(0.85*df_labeled_2223s.shape[0]),:]
tst_set = df_labeled_2223s.iloc[int(0.85*df_labeled_2223s.shape[0]):, :]

In [5]:
df_cmb_app = pd.concat([df_apps, train_set], axis=0, ignore_index=True)

print(df_cmb_app.shape)


(16972, 7)


In [6]:
# The original data, used for prediction after model tuning
doc_app = df_app['Description'].to_list()

doc_cmb_app = df_cmb_app['Description'].to_list()

doc_trn = train_set['Description'].to_list()
doc_val = val_set['Description'].to_list()
doc_tst = tst_set['Description'].to_list()

y_cmb_app = df_cmb_app['semi_label'].to_list()
y_trn = train_set['semi_label'].to_list()
y_val = val_set['semi_label'].to_list()
y_tst = tst_set['semi_label'].to_list()

# Semi-Supervised Model 

In [7]:

# Sentence Transformers convert docs (service requests) into numeric representations
#sentence_model = SentenceTransformer("all-mpnet-base-v2")
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# regenerate key words for topic representation
representation_model = KeyBERTInspired()
#representation_model = MaximalMarginalRelevance(diversity=0.5)

vectorizer_model = CountVectorizer(stop_words="english")
metric='euclidean'

In [8]:
# pre-calculate embeddings to save time
st = time.time()
dapp_embeddings = sentence_model.encode(doc_app, show_progress_bar=False)
#dcsl_embeddings = sentence_model.encode(doc_csl, show_progress_bar=False)

cmb_app_embeddings = sentence_model.encode(doc_cmb_app, show_progress_bar=False)
#cmb_csl_embeddings = sentence_model.encode(doc_cmb_csl, show_progress_bar=False)
trn_embeddings = sentence_model.encode(doc_trn, show_progress_bar=False)
val_embeddings = sentence_model.encode(doc_val, show_progress_bar=False)
tst_embeddings = sentence_model.encode(doc_tst, show_progress_bar=False)
elapsed = time.time()-st
print("Run time: " + time.strftime("%H:%M:%S.{}".format(str(elapsed % 1)[2:])[:11], time.gmtime(elapsed)))

Run time: 00:13:31.00


## Use the default setting to set up a baseline

In [9]:
# app data set
st = time.time()
topic_model_base_app = BERTopic()
                        
base_app_topics, base_app_probs = topic_model_base_app.fit_transform(doc_cmb_app, cmb_app_embeddings, y=y_cmb_app)


y_trn_base, tr_probs_base = topic_model_base_app.transform(doc_trn, trn_embeddings)
y_val_base, val_probs_base = topic_model_base_app.transform(doc_val, val_embeddings)
print(v_measure_score(y_trn_base, y_trn))
print(v_measure_score(y_val_base, y_val))

elapsed = time.time()-st
print("Run time: " + time.strftime("%H:%M:%S.{}".format(str(elapsed % 1)[2:])[:11], time.gmtime(elapsed)))

  self._set_arrayXarray(i, j, x)


0.4303009273102496
0.41078062705797946
Run time: 00:01:38.11


## Random Search
There are 4 hyper-parameters that are important for the performance of Bertopic model:  
2 hyper-parameters for umap, which is for dimension reduction;  
and 2 hyper-parameters for hdbscan, which is for clustering documents.  
Unfortunately, due to the limitation of computing power and time, we can not use grid search for hyper-parameters tuning, which will lead to thounsands of times of model training and testing.  
Instead we'll implement a Random search.

In [20]:
# Generate n sets of random parameters in certain range

# number of parameter sets
n = 50

np.random.seed(1)
# n_neighbors
nnbs = np.random.randint(low=5, high=30, size=n)
# n_component
ncpts = np.random.randint(low=5,high=30, size=n)
# min_cluster_size
mc_sizes = np.random.randint(low=10, high=60, size=n)
# min_sample_size
ms_sizes = np.random.randint(low=10, high=60, size=n)

In [21]:

# Sentence Transformers convert docs (service requests) into numeric representations
#sentence_model = SentenceTransformer("all-mpnet-base-v2")
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# regenerate key words for topic representation
representation_model = KeyBERTInspired()
#representation_model = MaximalMarginalRelevance(diversity=0.5)

vectorizer_model = CountVectorizer(stop_words="english")
metric='euclidean'

In [22]:
st = time.time()
tr_score, val_score, cnt_topics = [],[],[]
for nnb,ncpt,mcs,mss in zip(nnbs, ncpts,mc_sizes,ms_sizes):
    # UMAP model handles demension reduction which can be helpful with large datasets
    umap_model = UMAP(n_neighbors=nnb, n_components=ncpt, metric=metric, random_state=123)
    # HDB Scan works on clustering groups of similar embeddings
    hdbscan_model = HDBSCAN(min_cluster_size=mcs, min_samples=mss, prediction_data=True)
    
    topic_model_cmb_app = BERTopic(nr_topics="auto", umap_model=umap_model,embedding_model=sentence_model,\
                                   hdbscan_model=hdbscan_model, representation_model=representation_model,\
                                   vectorizer_model=vectorizer_model,min_topic_size=mcs, calculate_probabilities=True)
                        
    cmb_app_topics, cmb_app_probs = topic_model_cmb_app.fit_transform(doc_cmb_app, cmb_app_embeddings, y=y_cmb_app)

    
    cnt = max(cmb_app_topics)
    cnt_topics.append(cnt)
        
    y_trn_pred, tr_probs = topic_model_cmb_app.transform(doc_trn, trn_embeddings)
    y_val_pred, val_probs = topic_model_cmb_app.transform(doc_val, val_embeddings)
        
    tr_score.append(v_measure_score(y_trn_pred, y_trn))
    val_score.append(v_measure_score(y_val_pred, y_val))
    
elapsed = time.time()-st
print("Run time: " + time.strftime("%H:%M:%S.{}".format(str(elapsed % 1)[2:])[:11], time.gmtime(elapsed)))

  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._se

  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)


Run time: 01:45:05.25


In [23]:
d_scores = {'params':zip(nnbs, ncpts,mc_sizes,ms_sizes),'training_score':tr_score, 'val_score':val_score, 'topic_count':\
           cnt_topics}
gd_scores = pd.DataFrame(data=d_scores)
gd_scores.sort_values(by='val_score',ascending=False, inplace=True,ignore_index=True)

gd_scores

Unnamed: 0,params,training_score,val_score,topic_count
0,"(24, 5, 43, 47)",0.575809,0.556108,25
1,"(25, 28, 46, 43)",0.583999,0.552549,24
2,"(28, 24, 33, 59)",0.582774,0.545295,20
3,"(16, 22, 10, 51)",0.555445,0.543204,27
4,"(16, 12, 49, 30)",0.574522,0.541159,29
5,"(27, 20, 42, 25)",0.574882,0.537345,29
6,"(15, 28, 54, 50)",0.5757,0.536146,21
7,"(18, 19, 28, 46)",0.575082,0.535946,28
8,"(14, 15, 38, 16)",0.559881,0.535879,29
9,"(11, 20, 21, 58)",0.564084,0.531548,23


### The optimal model: Although the validation score is a very critical metric for the evaluation of the model performance, there are other factor to consider here, such as number of topics, number of outliers, and the saliency of the topics, after experimenting on the top 3 sets of parameter, the following set of hyper parameters turn out to be the optimal. 

In [10]:
# hyper-params that gain best val_score
n_nbors, n_components, min_cluster_size, min_samples = (20, 29, 12, 31)


In [11]:
#UMAP model handles demension reduction which can be helpful with large datasets
umap_model = UMAP(n_neighbors=n_nbors, n_components=n_components, metric=metric, random_state=123)
# HDB Scan works on clustering groups of similar embeddings
hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, prediction_data=True)
st = time.time()
topic_model_op_app = BERTopic(nr_topics="auto", umap_model=umap_model,embedding_model=sentence_model, hdbscan_model=hdbscan_model, \
                       representation_model=representation_model, vectorizer_model=vectorizer_model,\
                            min_topic_size=min_cluster_size, calculate_probabilities=True)
                        
cmb_app_topics, cmb_app_probs = topic_model_op_app.fit_transform(doc_cmb_app, cmb_app_embeddings, y=y_cmb_app)



y_trn_pred_1, tr_probs_1 = topic_model_op_app.transform(doc_trn, trn_embeddings)
y_val_pred_1, val_probs_1 = topic_model_op_app.transform(doc_val, val_embeddings)
print(v_measure_score(y_trn_pred_1, y_trn))
print(v_measure_score(y_val_pred_1, y_val))

elapsed = time.time()-st
print("Run time: " + time.strftime("%H:%M:%S.{}".format(str(elapsed % 1)[2:])[:11], time.gmtime(elapsed)))

  self._set_arrayXarray(i, j, x)


0.547630984087213
0.5191681586871684
Run time: 00:03:32.04


## Examine the Optimal model

In [12]:
# Model was assessed on how well-defined topics were and what topics were created. Hyperparameters can be adjusted to get different topics.
# The key to find a balance between topics that are specific but not too granular
pd.set_option('display.max_rows', None)
topic_model_op_app.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5612,-1_parked_parking_park_vehicle,"[parked, parking, park, vehicle, driveway, sid...",[I reported this as an abandoned vehicle and 3...
1,0,2460,0_homeless_encampment_encampments_camp,"[homeless, encampment, encampments, camp, side...","[Dumped trash around corner trash can, Homeles..."
2,1,1693,1_parked_parking_rv_park,"[parked, parking, rv, park, rvs, vehicle, stre...",[Car has been parked out in the street for sev...
3,2,1441,2_potholes_pothole_sinkhole_manhole,"[potholes, pothole, sinkhole, manhole, pavemen...","[Pothole in the middle of the street , large p..."
4,3,622,3_music_loud_sound_noise,"[music, loud, sound, noise, noisy, speakers, n...","[Loud music, Music to loud , Loud music ]"
5,4,600,4_traffic_streetlights_stoplight_lights,"[traffic, streetlights, stoplight, lights, str...","[Traffic light out, Traffic light is out, Gree..."
6,5,507,5_tree_trees_branch_branches,"[tree, trees, branch, branches, pine, sidewalk...",[A big tree branch fell. Please remove the br...
7,6,447,6_san_avenue_st_east,"[san, avenue, st, east, street, near, monterey...",[Between Santa Clara and San Fernando on the w...
8,7,443,7_hydrant_leaking_leak_leakage,"[hydrant, leaking, leak, leakage, water, drain...","[Fire hydrant is leaking water., Water leaking..."
9,8,348,8_sign_stopping_signage_signs,"[sign, stopping, signage, signs, broken, stop,...","[Broken stop sign, Missing street sign. Not su..."


In [28]:
df_cmb_app['Description'][np.array(cmb_app_topics)==23][:800].values

array(['do not want to save password',
       'Can ?t create 311 account due to missing password entry on create 311 account webpage.',
       'why a password for a trash account.....',
       'San Jose 311 Other Issues ID #230504-000870  Lan Diep',
       'Here are pictures of the issue I raised in issue I reported: 220410-000072.',
       'Please update  Company - JC Dukes   Phone # 415/671-1250',
       "page has no info, it's blank", 'Unable to open attachment',
       'This is the second time I ?ve reported this the first time was closed and nothing was done.',
       'The 311 mobile app does not remember my password on IOS. I ?ve change my password already and it still nags me to do it again. Every time I launch the app it forces me to enter my full email and password. There is no option to remember anything.  That makes the app very cumbersome and seems incredibly dated. Can you fix this soon?',
       'the reason for this request was not provided.  closing the request.',
      

In [23]:
topic_model_op_2223.save('model_cmb_op_2223')

## topic merging

In [13]:
topics_to_merge_app = [[0,24],[1,11,13,17,26,31,33,35],[4,21],[5,9],[7,15,18],[8,29],[10,20,22,28,32,38],[14,27,30],[16,34]]
topic_model_op_app.merge_topics(doc_cmb_app, topics_to_merge_app)

cmb_app_mg_topics = topic_model_op_app.topics_[:]

topic_model_op_app.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5612,-1_parked_parking_park_vehicle,"[parked, parking, park, vehicle, driveway, veh...",[Double parked commercial truck in residential...
1,0,2539,0_homeless_encampments_encampment_trash,"[homeless, encampments, encampment, trash, rec...","[Dumped trash around corner trash can, Homeles..."
2,1,2299,1_parked_parking_vehicle_park,"[parked, parking, vehicle, park, car, vehicles...",[The Vehicle is parked for 3 weeks and not mov...
3,2,1441,2_potholes_pothole_sinkhole_manhole,"[potholes, pothole, sinkhole, manhole, pavemen...","[Pothole in the middle of the street , large p..."
4,3,821,3_tree_trees_branches_branch,"[tree, trees, branches, branch, pine, overgrow...","[Overgrown tree on the sidewalk, tree branches..."
5,4,688,4_lights_green_stoplight_streetlight,"[lights, green, stoplight, streetlight, light,...",[The green light on 2 of the lights heading So...
6,5,670,5_leaking_leak_leakage_hydrant,"[leaking, leak, leakage, hydrant, drainage, dr...",[Water leaking non stop. Lots of water goin...
7,6,622,6_music_loud_sound_noise,"[music, loud, sound, noise, noisy, speakers, n...","[Loud music , Loud music , Very loud music]"
8,7,472,7_barking_bark_barks_neighbor,"[barking, bark, barks, neighbor, neighbors, do...",[Neighbors dog has been barking for over two h...
9,8,447,8_east_near_san_avenue,"[east, near, san, avenue, west, south, park, m...","[near Park Ave., On the San Fernando and Third..."


In [14]:
y_trn_pred, trn_probs = topic_model_op_app.transform(doc_trn, trn_embeddings)
y_val_pred, val_probs = topic_model_op_app.transform(doc_val, val_embeddings)


print(v_measure_score(y_trn_pred, y_trn))
print(v_measure_score(y_val_pred, y_val))

0.5842848718322512
0.5528093922639147


In [15]:
# Predict on 2223 other issues data
st = time.time()
df_app_topics, df_app_probs = topic_model_op_app.transform(doc_app, dapp_embeddings)
elapsed = time.time()-st
print("Run time: " + time.strftime("%H:%M:%S.{}".format(str(elapsed % 1)[2:])[:11], time.gmtime(elapsed)))

Run time: 00:02:22.66


## reduce outliers

In [16]:
# valiation score after topic merging and outlier reduction 
dist_val_pred = topic_model_op_app.reduce_outliers(doc_val, y_val_pred, strategy="distributions", distributions_params={'window':3,'stride':1})
v_measure_score(dist_val_pred, y_val)

0.591386604102205

In [17]:
print(f'number of potholes in val_set:{val_set[val_set.semi_label==0].shape[0]}')
print(f'number of parking in val_set: {val_set[val_set.semi_label==1].shape[0]}')
print(f'number of trash in val_set: {val_set[val_set.semi_label==2].shape[0]}')

number of potholes in val_set:310
number of parking in val_set: 305
number of trash in val_set: 179


In [18]:
print(100*sum((np.array(dist_val_pred)==2)&(val_set.semi_label==0))/val_set[val_set.semi_label==0].shape[0])
print(100*sum((np.array(dist_val_pred)==1)&(val_set.semi_label==1))/val_set[val_set.semi_label==1].shape[0])
print(100*(sum((np.array(dist_val_pred)==0)&(val_set.semi_label==2))+sum((np.array(dist_val_pred)==10)&(val_set.semi_label==2)))/val_set[val_set.semi_label==2].shape[0])

86.12903225806451
88.85245901639344
75.41899441340782


In [33]:
# test set score
y_tst_pred, tst_probs = topic_model_op_app.transform(doc_tst, tst_embeddings)
dist_tst_pred = topic_model_op_app.reduce_outliers(doc_tst, y_tst_pred, strategy='distributions', distributions_params={'window':3,'stride':1})

print(f'Score of build-in method: {v_measure_score(dist_tst_pred, y_tst)}')


Score of build-in method: 0.6230266562336086


In [34]:
dapp_new_topics = topic_model_op_app.reduce_outliers(doc_app, df_app_topics, strategy="distributions", distributions_params={'window':3,'stride':1})

In [35]:
topic_model_op_app.update_topics(docs=doc_app, topics=dapp_new_topics)
topic_model_op_app.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2,-1_still_here_front_in,"[still, here, front, in, , , , , , ]",[Double parked commercial truck in residential...
1,0,2393,0_the_homeless_and_to,"[the, homeless, and, to, of, trash, in, garbag...","[Dumped trash around corner trash can, Homeles..."
2,1,3135,1_parked_the_for_in,"[parked, the, for, in, and, on, this, car, is,...",[The Vehicle is parked for 3 weeks and not mov...
3,2,363,2_the_road_hole_lane,"[the, road, hole, lane, cover, and, is, on, it...","[Pothole in the middle of the street , large p..."
4,3,1136,3_tree_weeds_the_and,"[tree, weeds, the, and, trees, to, is, of, bra...","[Overgrown tree on the sidewalk, tree branches..."
5,4,885,4_light_traffic_the_turn,"[light, traffic, the, turn, green, lights, to,...",[The green light on 2 of the lights heading So...
6,5,899,5_water_drain_the_leaking,"[water, drain, the, leaking, is, of, leak, sto...",[Water leaking non stop. Lots of water goin...
7,6,681,6_loud_music_noise_and,"[loud, music, noise, and, the, to, it, is, at,...","[Loud music , Loud music , Very loud music]"
8,7,549,7_dog_barking_dead_dogs,"[dog, barking, dead, dogs, the, and, to, cat, ...",[Neighbors dog has been barking for over two h...
9,8,709,8_san_ave_and_st,"[san, ave, and, st, dr, rd, of, corner, jose, on]","[near Park Ave., On the San Fernando and Third..."


In [38]:
df_app['Description'][np.array(dapp_new_topics)==12]

12       This guy was watching closely when I parked my...
40            Person is now starting to block the sidewalk
48       Is this blocked exit door a fire safety issue?...
68       This kind of driving happens all the time. Car...
77       Groups of people following people around or in...
79       Got assaulted by this man today between 12:15 ...
101      Major traffic backups on 87 offramp, technolog...
219      Prostitutes at the corner this morning around ...
353      3 teens going car to car last night on our str...
433      Between Rhoda and Boynton.   Suspicious man wa...
443      I came home today and saw that the house nextd...
502      Red Dodge Durango Suv with black male youth sl...
526      On January 2$ this San Jose city vehicle #1018...
539      My car driver side front door was hit after 01...
546      The alameda I have a video of a person stealin...
581                            Catalyctic converter theft.
620      Who sprayed the tables with some brown stain? .

In [37]:
df_app['final_label'] = dapp_new_topics
df_app.head()

Unnamed: 0,Incident_ID,Date_Created,Description,Service,len,lang,semi_label,final_label
0,899307,2022-01-01 00:43:26,Feldspar Dr & Senter Rd inside Spanish cove mo...,Other Issues,11,en,-1,8
1,899308,2022-01-01 00:45:43,Extremely big and loud fireworks going off,Other Issues,7,en,-1,6
2,899312,2022-01-01 01:52:17,Building #6 (I'd like to remain anonymous).,Other Issues,7,en,-1,0
3,899315,2022-01-01 03:57:23,Silver leaf and south sea ct. loud music all n...,Other Issues,15,en,-1,6
4,899372,2022-01-01 10:16:05,The right hand traffic signal at 28th street w...,Other Issues,19,en,-1,4


In [39]:
final_label_dict = {-1: 'Other',
              0:'Homeless/Panhandler',
              1:'Parking Violation',
              2:'Pothole',
              3:'Plant',
              4:'Traffic Light',
              5:'Water Leakage',
              6:'Noise',
              7:'Animal',
              8:'Location Only',
              9:'Street/Sidewalk Infra',
              10:'Trash',
              11:'Parks/Play Ground',
              12:'Suspecious People/Behavior',
              13:'Fence',
              14:'Account',
              15:'Speeding',
              16:'Restroom',
              17:'Junk Pick-up',
              18:'Utility Cable/Box',
              19:'Business(tax,license, permit etc)',
              20:'Sewage/Drainage',
              21:'Neighboorhood Complaint',}

In [40]:
df_app['final_label_verb'] = df_app['final_label'].apply(lambda x: final_label_dict[x])
df_app.head()

Unnamed: 0,Incident_ID,Date_Created,Description,Service,len,lang,semi_label,final_label,final_label_verb
0,899307,2022-01-01 00:43:26,Feldspar Dr & Senter Rd inside Spanish cove mo...,Other Issues,11,en,-1,8,Location Only
1,899308,2022-01-01 00:45:43,Extremely big and loud fireworks going off,Other Issues,7,en,-1,6,Noise
2,899312,2022-01-01 01:52:17,Building #6 (I'd like to remain anonymous).,Other Issues,7,en,-1,0,Homeless/Panhandler
3,899315,2022-01-01 03:57:23,Silver leaf and south sea ct. loud music all n...,Other Issues,15,en,-1,6,Noise
4,899372,2022-01-01 10:16:05,The right hand traffic signal at 28th street w...,Other Issues,19,en,-1,4,Traffic Light


In [41]:
df_app.to_csv('data/result/Result_uncat_pst_app.csv', index=False)