In [1]:
import pandas as pd
import re
from datetime import datetime

In [2]:
events = pd.read_csv("data/Dataset1.csv")
accidents = pd.read_csv("data/Dataset2.csv")

### Fixing Columns

In [3]:
events.rename(str.lower, axis='columns', inplace=True)
accidents.rename(str.lower, axis='columns', inplace=True)
events.columns = events.columns.str.replace("\s", '_', regex=True).str.replace("\W", '', regex=True)
events.rename(columns=lambda x: x[re.search("[a-zA-Z]", x).start():], inplace=True)
accidents.columns = accidents.columns.str.replace("\s", '_', regex=True).str.replace("\W", '', regex=True)
accidents.rename(columns=lambda x: x[re.search("[a-zA-Z]", x).start():], inplace=True)

In [4]:
# primary occupation code = North American Industry Classification System codes (NAICS)
# National Occupational Classification (NOC)
events = events.rename(columns={'event_date':'date', 'municipality':'city', 'primary_occupation_code':'NAICS'})
accidents = accidents.rename(columns={'accident_date':'date', 'organization_city':'city', 'occupation_code':'NOC'})

### Date Field

In [5]:
events['date'] = pd.to_datetime(events['date'], format="mixed")
events = events[events['date'] < datetime.now()]

In [6]:
accidents['date'] = pd.to_datetime(accidents['date'], errors='coerce')
accidents = accidents.dropna(subset=['date']).reset_index(drop=True)
accidents = accidents[accidents['date'] < "2024-12-31"]

### Cleaning Text Fields

In [7]:
def fix_text(s:pd.Series)->pd.Series:
    s = s.str.replace("\s", '_', regex=True)
    s = s.str.replace("\W", '', regex=True)
    s.fillna('', inplace=True)
    s = s.astype(str)
    s = s.apply(str.lower)
    return s

In [8]:
events['city'] = fix_text(events['city'])

In [9]:
print(list(events['city'].unique()))

['scarborough', 'north_bay', 'whitby', 'toronto', 'sault_ste_marie', 'timmins', 'hamilton', 'stoney_creek', 'thunder_bay', 'ottawa', 'lively', 'tillsonburg', 'brougham', 'brampton', 'bowmanville', 'burlington', 'north_york', 'london', 'kitchener', 'mississauga', 'oshawa', 'springwater_twp', 'elliot_lake', 'ajax', 'dryden', 'milton', 'georgetown', 'pickering', 'pontypool', '', 'ingersoll', 'lorignal', 'orillia', 'tiverton', 'holland_landing', 'owen_sound', 'hannon', 'ancaster', 'newmarket', 'sudbury', 'stratford', 'huntsville', 'windsor', 'peterborough', 'port_hope', 'st_catharines', 'vaughan', 'balmertown', 'kapuskasing', 'brantford', 'mount_forest', 'richmond_hill', 'woodbridge', 'markham', 'fort_albany', 'picton', 'caledonia', 'kingston', 'chapleau', 'tilbury', 'parkhill', 'collingwood', 'waubaushene', 'campbellville', 'guelph', 'welland', 'fort_erie', 'hawkesbury', 'chaput_hughes', 'marathon', 'barrie', 'stouffville', 'cambridge', 'cochrane', 'bradford', 'nepean', 'fergus', 'innisfi

In [10]:
accidents['city'] = fix_text(accidents['city'])
accidents['industry_sector_description'] = fix_text(accidents['industry_sector_description'])
print(list(accidents['industry_sector_description'].unique()))

['transportation', 'industrial', 'manufacturing', 'construction', 'services', 'health_care', 'educationindustrial', 'municipal', 'agriculture', 'food', 'electrical', 'mining', 'chemicalprocess', 'not_assigned']


#### Occupation Description

In [11]:
events['occupation_description'] = events['occupation_description'].str.lower()
accidents['occupation_description'] = accidents['occupation_description'].str.lower()

#### NOC

In [12]:
accidents['NOC'] = accidents['NOC'].apply(lambda x: pd.NA if re.findall("\D", x) else x)
# accidents = accidents.dropna(subset=['NOC'])
accidents

Unnamed: 0,date,worker_age,occupation_category_code,occupation_category_description,NOC,occupation_description,worker_experience_in_years,accident_source_category_description1,source_category_description2,accident_category_description,accident_place_description,city,organization_province_code,industry_sector_description
0,2023-10-14,58.0,72,FACILITY OPERATION AND MAINTENANCE MANAGERS,722,facility operation managers,0.5,FLOORS,UNKNOWN,FALL ON SAME LEVEL,PLACE NOT SPECIFIED,markham,ON,transportation
1,2023-11-04,29.0,732,MOTOR VEHICLE MECHANICS,7322,"motor vehicle mechanics, technicians and mecha...",0.5,TRUCK,"VEHICLE & MOBILE EQUIP. PARTS, N.E.C.",CAUGHT IN OR COMPRESSED BY EQUIP./OBJECTS,FARM.,winnipeg,MB,industrial
2,2023-12-17,30.0,745,LONGSHORE WORKERS AND MATERIAL HANDLERS,7453,material handlers,0.5,"PLANT & INDUSTRIAL POWERED VEHICLE, N.E.C.","BODILY MOTION OR POSITION OF INJURED, ILL WORKER",STRUCK BY OBJECT,"RESIDENTIAL INSTITUTION. (HOSPITALS, ORPHANAGE...",harriston,ON,industrial
3,2023-11-13,64.0,961,"LABOURERS IN PROCESSING, MANUFACTURING AND UTI...",9613,labourers in metal fabrication,0.5,PARKING LOTS,WEATHER AND ATMOSPHERIC CONDITIONS,FALL ON SAME LEVEL,PLACE FOR SPORTS AND RECREATION.,markham,ON,manufacturing
4,2023-11-13,74.0,741,MOTOR VEHICLE AND TRANSIT DRIVERS,7412,truck drivers,0.5,OTHER STRUCTURES,WEATHER AND ATMOSPHERIC CONDITIONS,FALL ON SAME LEVEL,PUBLIC BUILDINGS.,newcastle,ON,industrial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15445,2022-11-25,51.0,961,"LABOURERS IN PROCESSING, MANUFACTURING AND UTI...",9619,"other labourers in processing, manufacturing a...",0.5,"STAIRS, STEPS","BODILY MOTION OR POSITION OF INJURED, ILL WORKER",FALL TO LOWER LEVEL,HOME.,toronto,ON,construction
15446,2023-07-24,57.0,761,TRADES HELPERS AND LABOURERS,7612,construction trades helpers and labourers,0.5,TRACTOR,UNKNOWN,"PEDESTRIAN STRUCK BY VEHICLE, MOBILE EQUIPMENT",INDUSTRIAL SITE.,dunchurch,ON,construction
15447,2023-07-05,60.0,735,STATIONARY ENGINEERS & POWER STATION & SYSTEM...,7353,power systems and power station operators,0.5,"BODILY MOTION OR POSITION OF INJURED, ILL WORKER","BODILY MOTION OR POSITION OF INJURED, ILL WORKER",REPETITIVE MOTION,INDUSTRIAL SITE.,ajax,ON,manufacturing
15448,2022-05-05,42.0,341,ASSISTING OCCUPATIONS IN SUPPORT OF HEALTH SER...,3414,nurse aides and orderlies,0.5,HEALTH CARE PATIENT OR RESIDENT OF HEALTH CARE...,UNKNOWN,"ASSAULTS, VIOLENT ACTS AND HARASSMENT BY PERSO...",PLACE NOT SPECIFIED,toronto,ON,health_care


### Age

In [13]:
accidents = accidents[accidents['worker_age'] <= 100]
accidents['worker_experience_in_years'] = accidents['worker_experience_in_years'].fillna(0)
accidents = accidents[accidents['worker_age'] >= accidents['worker_experience_in_years']]
accidents

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  accidents['worker_experience_in_years'] = accidents['worker_experience_in_years'].fillna(0)


Unnamed: 0,date,worker_age,occupation_category_code,occupation_category_description,NOC,occupation_description,worker_experience_in_years,accident_source_category_description1,source_category_description2,accident_category_description,accident_place_description,city,organization_province_code,industry_sector_description
0,2023-10-14,58.0,72,FACILITY OPERATION AND MAINTENANCE MANAGERS,722,facility operation managers,0.5,FLOORS,UNKNOWN,FALL ON SAME LEVEL,PLACE NOT SPECIFIED,markham,ON,transportation
1,2023-11-04,29.0,732,MOTOR VEHICLE MECHANICS,7322,"motor vehicle mechanics, technicians and mecha...",0.5,TRUCK,"VEHICLE & MOBILE EQUIP. PARTS, N.E.C.",CAUGHT IN OR COMPRESSED BY EQUIP./OBJECTS,FARM.,winnipeg,MB,industrial
2,2023-12-17,30.0,745,LONGSHORE WORKERS AND MATERIAL HANDLERS,7453,material handlers,0.5,"PLANT & INDUSTRIAL POWERED VEHICLE, N.E.C.","BODILY MOTION OR POSITION OF INJURED, ILL WORKER",STRUCK BY OBJECT,"RESIDENTIAL INSTITUTION. (HOSPITALS, ORPHANAGE...",harriston,ON,industrial
3,2023-11-13,64.0,961,"LABOURERS IN PROCESSING, MANUFACTURING AND UTI...",9613,labourers in metal fabrication,0.5,PARKING LOTS,WEATHER AND ATMOSPHERIC CONDITIONS,FALL ON SAME LEVEL,PLACE FOR SPORTS AND RECREATION.,markham,ON,manufacturing
4,2023-11-13,74.0,741,MOTOR VEHICLE AND TRANSIT DRIVERS,7412,truck drivers,0.5,OTHER STRUCTURES,WEATHER AND ATMOSPHERIC CONDITIONS,FALL ON SAME LEVEL,PUBLIC BUILDINGS.,newcastle,ON,industrial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15445,2022-11-25,51.0,961,"LABOURERS IN PROCESSING, MANUFACTURING AND UTI...",9619,"other labourers in processing, manufacturing a...",0.5,"STAIRS, STEPS","BODILY MOTION OR POSITION OF INJURED, ILL WORKER",FALL TO LOWER LEVEL,HOME.,toronto,ON,construction
15446,2023-07-24,57.0,761,TRADES HELPERS AND LABOURERS,7612,construction trades helpers and labourers,0.5,TRACTOR,UNKNOWN,"PEDESTRIAN STRUCK BY VEHICLE, MOBILE EQUIPMENT",INDUSTRIAL SITE.,dunchurch,ON,construction
15447,2023-07-05,60.0,735,STATIONARY ENGINEERS & POWER STATION & SYSTEM...,7353,power systems and power station operators,0.5,"BODILY MOTION OR POSITION OF INJURED, ILL WORKER","BODILY MOTION OR POSITION OF INJURED, ILL WORKER",REPETITIVE MOTION,INDUSTRIAL SITE.,ajax,ON,manufacturing
15448,2022-05-05,42.0,341,ASSISTING OCCUPATIONS IN SUPPORT OF HEALTH SER...,3414,nurse aides and orderlies,0.5,HEALTH CARE PATIENT OR RESIDENT OF HEALTH CARE...,UNKNOWN,"ASSAULTS, VIOLENT ACTS AND HARASSMENT BY PERSO...",PLACE NOT SPECIFIED,toronto,ON,health_care


### Saving

In [14]:
events.to_csv('cleaned_data/events.csv', index=0)
accidents.to_csv('cleaned_data/accidents.csv', index=0)