In [1]:
import numpy as np
import pandas as pd
import json
import difflib
from bs4 import BeautifulSoup

from collections import defaultdict
from pandas.io.json import json_normalize

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

import sys
# Python 2 only: make UTF-8 the interpreter-wide default string encoding.
# `reload(sys)` is called so that `sys.setdefaultencoding` can be used.
# sys.stdout is saved first and restored afterwards -- presumably because
# reload(sys) would otherwise replace the notebook's output stream (confirm).
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout


Functions for reading and analysing the Indeed data

In [74]:
def read_data(filepath):
    """Load a JSON job-ad dump into a flat DataFrame.

    The file must contain a single JSON object with a 'results' key holding
    the list of job-ad records; every record becomes one row and nested
    fields are flattened by `json_normalize`.

    Parameters
    ----------
    filepath : str
        Path to the JSON text file.

    Returns
    -------
    pandas.DataFrame
        One row per entry of 'results'.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    ValueError
        If the file content is not valid JSON.
    KeyError
        If the top-level object has no 'results' key.
    """
    # The context manager closes the handle even when parsing fails.
    with open(filepath, 'r') as handle:
        payload = json.load(handle)
    return json_normalize(payload['results'])

def analyse_df(df):
    """Print a quick overview of a DataFrame.

    Prints, in order: the first rows, summary statistics, column dtypes,
    the number of null values per column, and the (rows, columns) shape.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
    """
    # Single-argument print(...) calls produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    print(df.head())
    print(df.describe())
    print(df.dtypes)
    print(df.isnull().sum())
    print(df.shape)

def parse_summary(text):
    """Return the text content of an HTML job summary, lower-cased.

    The markup is parsed with BeautifulSoup's 'html.parser' backend and only
    the text obtained from `get_text()` is kept.
    """
    return BeautifulSoup(text, 'html.parser').get_text().lower()

In [75]:
# JSON text dumps of Indeed job ads: one file for full-time, one for part-time.
filepath1 = './indeed_job_search_api/job_ad_fulltime_with_summary.txt'
filepath2 = './indeed_job_search_api/job_ad_parttime_with_summary.txt'

# read_data flattens the 'results' list of each file into a DataFrame.
fulltime_data = read_data(filepath1)
parttime_data = read_data(filepath2)

In [76]:
# Gender pay gap CSV (2017 to 2018 according to the file name); the printed
# overview shows employer name, address and pay-difference percentage columns.
gov_data = pd.read_csv('./UK Gender Pay Gap Data - 2017 to 2018.csv')
analyse_df(gov_data)

                           EmployerName  \
0       "Bryanston School",Incorporated   
1  "RED BAND" CHEMICAL COMPANY, LIMITED   
2                           118 LIMITED   
3                     123 EMPLOYEES LTD   
4                          1610 LIMITED   

                                             Address CompanyNumber SicCodes  \
0  Bryanston House,\r\nBlandford,\r\nDorset,\r\nU...      00226143    85310   
1  19, Smith's Place,\r\nLeith Walk,\r\nEdinburgh...      SC016876    47730   
2  Fusion Point,\r\nDumballs Road,\r\nCardiff,\r\...      03951948    61900   
3  34, Roundhay Road,\r\nLeeds,\r\nEngland,\r\nLS...      10530651    78300   
4  Hestercombe House,\r\nCheddon Fitzpaine,\r\nTa...      06727055    93110   

   DiffMeanHourlyPercent  DiffMedianHourlyPercent  DiffMeanBonusPercent  \
0                   18.0                     28.2                   0.0   
1                    2.3                     -2.7                  15.0   
2                    1.7                 

In [77]:
# Lookup table pairing Indeed company names with pay-gap employer names; the
# columns used below are 'Indeed Company' and 'Human-Verified GPG Match'.
matches = pd.read_csv('./Indeed to GPG Company Name - indeed_companies_match.csv')

In [78]:
# Drop two columns that the cells shown here never read.
# NOTE: `matches` is rebound, so re-running this cell without reloading the
# CSV raises KeyError because the columns are already gone.
matches = matches.drop(['Verifier', 'Suggested GPG Matches'], axis=1)

In [79]:
# Keep the rows whose 'Human-Verified GPG Match' is not the literal string
# "no match". Rows with a missing value there are kept too (NaN != "no match").
matched_companies = matches[matches['Human-Verified GPG Match'] != "no match"]

Strip the HTML markup from the job summary column, lower-case the text, and write the cleaned text back into each dataframe

In [80]:
# Full-time ads: clean the summary text in place and flag the rows.
fulltime_data['jobsummary'] = fulltime_data['jobsummary'].apply(parse_summary)
fulltime_data['Fulltime'] = True

# Part-time ads: same cleaning, opposite flag.
parttime_data['jobsummary'] = parttime_data['jobsummary'].apply(parse_summary)
parttime_data['Fulltime'] = False

analyse_df(fulltime_data)


               city                                            company  \
0           Croydon                                     Harris Careers   
1            London       King's College Hospital NHS Foundation Trust   
2            Oxford   Oxford University Hospitals NHS Foundation Trust   
3        Canterbury  Kent and Medway NHS and Social Care Partnershi...   
4  South Kensington                         Victoria and Albert Museum   

  country                           date  expired formattedLocation  \
0      GB  Fri, 22 Jun 2018 04:02:55 GMT    False       Croydon CR0   
1      GB  Fri, 22 Jun 2018 06:06:14 GMT    False        London SE5   
2      GB  Fri, 22 Jun 2018 06:17:06 GMT    False        Oxford OX3   
3      GB  Fri, 22 Jun 2018 06:06:13 GMT    False    Canterbury CT1   
4      GB  Fri, 22 Jun 2018 07:52:25 GMT    False  South Kensington   

  formattedLocationFull formattedRelativeTime  indeedApply            jobkey  \
0           Croydon CR0           4 hours ago   

In [81]:
# Stack full-time and part-time ads into one frame; the 'Fulltime' column set
# earlier tells them apart. Row labels are kept as-is, so index values from
# the two input frames can repeat.
indeed = pd.concat([fulltime_data, parttime_data])

In [82]:
# Normalise company names (cast to str, trimmed, lower-cased) before they are
# used as the merge key against the match table.
indeed['company'] = indeed['company'].astype(str).map(lambda x: x.strip().lower())
analyse_df(indeed)

               city                                            company  \
0           Croydon                                     harris careers   
1            London       king's college hospital nhs foundation trust   
2            Oxford   oxford university hospitals nhs foundation trust   
3        Canterbury  kent and medway nhs and social care partnershi...   
4  South Kensington                         victoria and albert museum   

  country                           date  expired formattedLocation  \
0      GB  Fri, 22 Jun 2018 04:02:55 GMT    False       Croydon CR0   
1      GB  Fri, 22 Jun 2018 06:06:14 GMT    False        London SE5   
2      GB  Fri, 22 Jun 2018 06:17:06 GMT    False        Oxford OX3   
3      GB  Fri, 22 Jun 2018 06:06:13 GMT    False    Canterbury CT1   
4      GB  Fri, 22 Jun 2018 07:52:25 GMT    False  South Kensington   

  formattedLocationFull formattedRelativeTime  indeedApply            jobkey  \
0           Croydon CR0           4 hours ago   

In [83]:
# Inner join: keep only ads whose normalised company name equals an
# 'Indeed Company' entry in the filtered match table.
matched_indeed = indeed.merge(matched_companies, how='inner', left_on="company", right_on="Indeed Company")
analyse_df(matched_indeed)

      city                                           company country  \
0  Croydon                                    harris careers      GB   
1   Oxford  oxford university hospitals nhs foundation trust      GB   
2   Oxford  oxford university hospitals nhs foundation trust      GB   
3   Oxford  oxford university hospitals nhs foundation trust      GB   
4   Oxford  oxford university hospitals nhs foundation trust      GB   

                            date  expired formattedLocation  \
0  Fri, 22 Jun 2018 04:02:55 GMT    False       Croydon CR0   
1  Fri, 22 Jun 2018 06:17:06 GMT    False        Oxford OX3   
2  Fri, 22 Jun 2018 06:05:56 GMT    False        Oxford OX3   
3  Fri, 22 Jun 2018 06:05:37 GMT    False        Oxford OX3   
4  Fri, 22 Jun 2018 06:04:57 GMT    False        Oxford OX3   

  formattedLocationFull formattedRelativeTime  indeedApply            jobkey  \
0           Croydon CR0           4 hours ago        False  5b20a1592d2070b8   
1            Oxford OX3     

In [84]:
# Apply the same normalisation (str cast, trim, lower-case) to the pay-gap
# employer names, which serve as the right-hand key of the next merge.
gov_data['EmployerName'] = gov_data['EmployerName'].astype(str).map(lambda x: x.strip().lower())

In [85]:
# Inner join the matched ads with the pay-gap rows on the employer name.
# NOTE(review): 'EmployerName' was trimmed and lower-cased in the previous
# cell; rows only join if 'Human-Verified GPG Match' is stored in the same
# normalised form -- confirm against the match CSV.
complete_data = matched_indeed.merge(gov_data, how='inner', left_on="Human-Verified GPG Match", right_on="EmployerName")

In [86]:
# Print the overview of the fully joined frame (ads + match table + pay gap).
analyse_df(complete_data)

      city                                           company country  \
0  Croydon                                    harris careers      GB   
1   Oxford  oxford university hospitals nhs foundation trust      GB   
2   Oxford  oxford university hospitals nhs foundation trust      GB   
3   Oxford  oxford university hospitals nhs foundation trust      GB   
4   Oxford  oxford university hospitals nhs foundation trust      GB   

                            date  expired formattedLocation  \
0  Fri, 22 Jun 2018 04:02:55 GMT    False       Croydon CR0   
1  Fri, 22 Jun 2018 06:17:06 GMT    False        Oxford OX3   
2  Fri, 22 Jun 2018 06:05:56 GMT    False        Oxford OX3   
3  Fri, 22 Jun 2018 06:05:37 GMT    False        Oxford OX3   
4  Fri, 22 Jun 2018 06:04:57 GMT    False        Oxford OX3   

  formattedLocationFull formattedRelativeTime  indeedApply            jobkey  \
0           Croydon CR0           4 hours ago        False  5b20a1592d2070b8   
1            Oxford OX3     

In [87]:
# List every column available after the two merges.
print(complete_data.columns)

Index([                     u'city',                   u'company',
                         u'country',                      u'date',
                         u'expired',         u'formattedLocation',
           u'formattedLocationFull',     u'formattedRelativeTime',
                     u'indeedApply',                    u'jobkey',
                      u'jobsummary',                  u'jobtitle',
                        u'language',               u'onmousedown',
                         u'snippet',                    u'source',
                       u'sponsored',                     u'state',
                        u'stations',                       u'url',
                        u'Fulltime',            u'Indeed Company',
        u'Human-Verified GPG Match',              u'EmployerName',
                         u'Address',             u'CompanyNumber',
                        u'SicCodes',     u'DiffMeanHourlyPercent',
         u'DiffMedianHourlyPercent',      u'DiffMeanBonusPerce

In [88]:
# Keep only the columns needed for the text analysis. `.copy()` makes this an
# independent frame, so assigning to `subset_data['jobsummary']` in a later
# cell does not write into a slice of `complete_data` (which would trigger
# pandas' SettingWithCopyWarning).
subset_data = complete_data[['company', 'jobtitle', 'DiffMeanHourlyPercent', 'DiffMedianHourlyPercent', 'jobsummary']].copy()
#analyse_df(subset_data)

                                            company  \
0                                    harris careers   
1  oxford university hospitals nhs foundation trust   
2  oxford university hospitals nhs foundation trust   
3  oxford university hospitals nhs foundation trust   
4  oxford university hospitals nhs foundation trust   

                                       jobtitle  DiffMeanHourlyPercent  \
0  2018/19 Harris Graduate Programme: Secondary                   18.2   
1                            Genetics Scientist                   26.0   
2                             Foundation Year 2                   26.0   
3                               Linen Assistant                   26.0   
4                        Health Records Officer                   26.0   

                                          jobsummary  
0  if you are a recent graduate who is passionate...  
1  fixed term (12 months) clinical scientist - ba...  
2                                               none  
3  l

In [None]:
#df = pd.concat([df_fulltime, df_parttime]) # text files from with job summary
# df['Matches'] = df['company'].map(lambda x: difflib.get_close_matches(x, gov_data['EmployerName']))
# match_df = df.Matches.apply(pd.Series).rename(columns={0:'EmployerName'})
# df = pd.concat([match_df['EmployerName'],df], axis=1)
# pay_desc_df = pd.merge(pay_df,df, how="inner", on='EmployerName')
#analyse_df(pay_desc_df)

In [90]:
# The tokenizer and stop-word set are built on first use and then shared,
# because tokenise_text is applied once per row and reloading the stop-word
# corpus on every call is wasted work.
_TOKENISE_RESOURCES = {}


def _tokenise_resources():
    """Return the shared (tokenizer, stop-word set) pair, creating it on first call."""
    if not _TOKENISE_RESOURCES:
        # Tokens are runs of letters/underscores delimited by word boundaries:
        # punctuation is dropped and words that contain digits are skipped.
        tokenizer = RegexpTokenizer(r'\b[^\d\W]+\b')
        stop_words = frozenset(stopwords.words('english'))
        # Store both only after both were built, so a failed corpus lookup
        # does not leave a half-filled cache behind.
        _TOKENISE_RESOURCES['tokenizer'] = tokenizer
        _TOKENISE_RESOURCES['stop_words'] = stop_words
    return _TOKENISE_RESOURCES['tokenizer'], _TOKENISE_RESOURCES['stop_words']


def tokenise_text(text):
    """Split text into lower-cased word tokens with English stop words removed.

    Parameters
    ----------
    text : str
        Text to tokenise.

    Returns
    -------
    list
        Tokens in their original order, excluding NLTK's English stop words.

    Raises
    ------
    AttributeError
        If `text` is not a string (it has no `.lower()`).
    LookupError
        Raised by NLTK when the stop-word corpus is not installed.
    """
    tokenizer, stop_words = _tokenise_resources()
    return [token for token in tokenizer.tokenize(text.lower())
            if token not in stop_words]


In [9]:
# Spot-check the tokeniser on the first cleaned full-time job summary.
# The cleaned summaries live in fulltime_data['jobsummary']; the previously
# used name `clean_summary_full` is not defined anywhere in this notebook and
# fails with NameError on a fresh kernel.
text1 = fulltime_data['jobsummary'].iloc[0]
#print(text1)
tokens = tokenise_text(text1)
#print(tokens)

In [92]:
# Replace each cleaned summary string with its list of tokens. `assign` returns
# a new frame instead of writing into `subset_data` in place, which avoids
# pandas' SettingWithCopyWarning when `subset_data` is a slice of another frame.
# NOTE: re-running this cell fails, because the column then holds token lists
# rather than strings.
subset_data = subset_data.assign(jobsummary=subset_data['jobsummary'].apply(tokenise_text))

In [14]:
def find_word_occurrences(dataframe, word):
    """Print 'company | jobtitle' for every row whose job summary contains `word`.

    The lower-cased word is looked up with `in` against each row's
    'jobsummary' value: a substring test when that value is a string, a
    membership test when it is a list of tokens.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with 'company', 'jobtitle' and 'jobsummary' columns.
    word : str
        Word to search for; matching uses its lower-cased form.

    Returns
    -------
    int
        Number of rows that matched.
    """
    needle = word.lower()
    count = 0
    # Iterate over the frame that was passed in, not a notebook-level global.
    for _, row in dataframe.iterrows():
        if needle in row['jobsummary']:
            count += 1
            print(row['company'] + " | " + row['jobtitle'])
    return count

In [16]:
#find_word_occurrences(df_fulltime, 'macintyre')

In [None]:
#analyse_df(df)

#ad1 = fulltime_data[0]
#print(ad1)
#print(len(ad1))
#for item in ad1:
    #print item, "\n", ad1[item], "\n"

#print ad1["city"]

#for item in data[0:10]:
#    print item["city"], "\n"

# cities = []
# city_dict = defaultdict(int)
# for item in data:
#     city = item["city"]
#     if city not in city_dict:
#         city_dict[city] = 1
#         cities.append(city)
#     else:
#         city_dict[city] += 1

#print(len(cities))
#print(city_dict)

#title_and_summary = df[['jobtitle','jobsummary']]
#print(title_and_summary.head())