In [1]:
# Import appropriate libraries
import numpy as np
import csv
import matplotlib.pyplot as plt
import pandas as pd
import datetime


In [2]:
# Read in csv data file
data = pd.read_csv("data.csv", low_memory=False)

In [3]:
# Display data variables and their types
data.dtypes

Id                   object
Title                object
Company              object
Date                 object
Location             object
Area                 object
Classification       object
SubClassification    object
Requirement          object
FullDescription      object
LowestSalary          int64
HighestSalary         int64
JobType              object
dtype: object

In [4]:
# Part 1: Data Preparation and Preprocessing
# Delete any rows where any variable is missing a variable(NaN)
data = data[data['Id'].notna()]
data = data[data['Title'].notna()]
data = data[data['Company'].notna()]
data = data[data['Date'].notna()]
data = data[data['Location'].notna()]
data = data[data['Area'].notna()]
data = data[data['Classification'].notna()]
data = data[data['SubClassification'].notna()]
data = data[data['Requirement'].notna()]
data = data[data['FullDescription'].notna()]
data = data[data['LowestSalary'].notna()]
data = data[data['HighestSalary'].notna()]
data = data[data['JobType'].notna()]


In [5]:
# Clean data by removing Id as it won't be used
del data["Id"]
# Format FullDescription removing html formats from the text
# Create function to remove the html tags from the descriptions
def del_html(string):
    import re
    # Clean FullDescription by removing all html tags, newline characters, and asterisks present in the strings
    filtered = re.compile('<.*?>|&nbsp;|&amp;|\u260e|\n|[*]', re.DOTALL|re.M)
    return re.sub(filtered, '', string)
data['FullDescription'] = data['FullDescription'].apply(del_html)
data.head()

Unnamed: 0,Title,Company,Date,Location,Area,Classification,SubClassification,Requirement,FullDescription,LowestSalary,HighestSalary,JobType
121,Fabricator/Installer,WORKPLACE ACCESS & SAFETY,2018-10-07T00:00:00.000Z,Melbourne,Bayside & South Eastern Suburbs,Trades & Services,Welders & Boilermakers,Trade qualified person with skills in welding ...,Secure long term role with genuine ca...,0,30,Full Time
122,Boilermaker,RPM Contracting QLD P/l,2018-10-07T00:00:00.000Z,Brisbane,Southern Suburbs & Logan,Trades & Services,Welders & Boilermakers,Perm rate $30. Structural steel fab & weld out...,One of Australia's best engineering workshops ...,0,30,Full Time
125,Casual Childcare Positions | Bondi Junction,anzuk Education,2018-10-07T00:00:00.000Z,Sydney,"CBD, Inner West & Eastern Suburbs",Education & Training,Teaching - Early Childhood,"anzuk education are searching for reliable, en...",What is anzuk? anzuk Early Childhoodis a rec...,0,30,Contract/Temp
126,Technician,Zoom Recruitment & Training,2018-10-07T00:00:00.000Z,Sydney,South West & M5 Corridor,Engineering,Mechanical Engineering,"This Australian Icon, connects the people of t...","This Australian Icon, connects the people of t...",0,30,Full Time
127,Systems Engineer,Humanised Group,2018-10-07T00:00:00.000Z,Brisbane,CBD & Inner Suburbs,Information & Communication Technology,Networks & Systems Administration,Systems Engineer to work on BAU/Projects for a...,The Company This organisation is well-establi...,0,30,Full Time


In [6]:
# Normalize the data by converting Date from object to Datetime data type
data['Date'] = pd.to_datetime(data['Date'])
data.dtypes

Title                             object
Company                           object
Date                 datetime64[ns, UTC]
Location                          object
Area                              object
Classification                    object
SubClassification                 object
Requirement                       object
FullDescription                   object
LowestSalary                       int64
HighestSalary                      int64
JobType                           object
dtype: object

In [29]:
# Part 2: Data Analysis and Intepretation
# Determine job Classification and SubClassifications
# Extract the job title, classification, and subclassification
data1 = data[['Title','Classification','SubClassification']]
# Group the Classification and subclassification to measure classification and subclassification count
data1 = data1.groupby(['Classification','SubClassification']).size().reset_index().rename(columns={0:'Count'})
# Sort classifications and the counts together - this shows the most popular sub classification within each classification
data1 = data1.sort_values(by=['Classification','Count'], ascending=False)
data1 = data1.reset_index()
print(data1.to_string())


     index                          Classification                                SubClassification  Count
0      372                       Trades & Services                                Automotive Trades   1601
1      378                       Trades & Services                                     Electricians    806
2      383                       Trades & Services                                        Labourers    691
3      379                       Trades & Services                    Fitters, Turners & Machinists    621
4      382                       Trades & Services                           Hair & Beauty Services    601
5      394                       Trades & Services                           Welders & Boilermakers    546
6      393                       Trades & Services                                      Technicians    526
7      376                       Trades & Services                       Carpentry & Cabinet Making    524
8      381                       Trad

In [121]:
# Determine Job locations
data2 = data[['Title','Location']]
data2 = data2.groupby(['Title','Location']).size().reset_index().rename(columns={0:'Count'})
# Sort classifications and the counts together - this shows the most popular sub classification within each classification
data2 = data2.sort_values(by=['Location'], ascending=False)
pd.set_option('display.max_rows',None)
print(data2)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

