In [119]:
import pandas as pd
import numpy as np
import pycountry
from geopy.geocoders import Nominatim


In [120]:
# load the cleaned data
df = pd.read_csv('data/cleaned_data.csv')
df.head()

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48K+ *,"Computer Science,Data quality,Genetics,Mathema...",",,,,"
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48K+ *,"Agile,Data management,Finance,Security,,",",,,,"
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,Not-Specified,90K+ *,"Agile,Architecture,AWS,Computer Science,Comput...","Career development,,,,"
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48K+ *,"Engineering,Industrial,Oracle,Power BI,R,R&D",",,,,"
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108K+,"AWS,Azure,Computer Science,Consulting,Dataflow...","Flex hours,Flex vacation,Parental leave,Unlimi..."


## Finding countries of the offers
---

In [121]:
# get the country if exists in Location column
def get_country(location):
    try:
        # remove the special characters
        location = location.replace('.', '').replace(' or ', ', ').replace(' - ', ', ').replace('(', ', ').replace(' the ', ', ').replace(' & ', ', ').replace(';', ', ').replace(')', '')
        
        location_parts = location.split(', ')
        location_parts.reverse()

        # all countries and countries codes in the pycountry
        countries = [country.name for country in pycountry.countries]
        countries_codes = [country.alpha_3 for country in pycountry.countries]
        lower_countries_codes = [country.lower().strip() for country in countries_codes]
        lower_countries = [country.lower().strip() for country in countries]
        found_countries = []
        for part in location_parts:
            lower_location_part = part.lower()
            for index, country in enumerate(lower_countries):
                if country in lower_location_part or lower_countries_codes[index] in lower_location_part:
                    if len(found_countries) > 0:
                        return ", ".join(found_countries)
                    found_countries.append(countries[index])

        if found_countries:
            return ", ".join(found_countries)
        elif 'remote' in location.lower():
            found_countries.append('Remote')
        else:
            return get_country_from_city(location)
    except:
        return np.nan
    
# get the country from the city name
def get_country_from_city(city_name):
    geolocator = Nominatim(user_agent="city_country_lookup")
    # lower the city name
    city_name = city_name.lower()
    # ceparate by , and check for each part
    city_name = city_name.split(',')
    # reverse the list
    city_name.reverse()
    for part in city_name:
        location = geolocator.geocode(part, exactly_one=True, limit=1)
        if location is not None:
            country = str(location).split(',')[-1]
            return country.strip()
    if location is None:
        return np.nan

In [122]:
df['country'] = df['Location'].apply(get_country)

In [123]:
# null values in country column
df.head()

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities,country
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48K+ *,"Computer Science,Data quality,Genetics,Mathema...",",,,,",United States
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48K+ *,"Agile,Data management,Finance,Security,,",",,,,",Mauritius
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,Not-Specified,90K+ *,"Agile,Architecture,AWS,Computer Science,Comput...","Career development,,,,",United States
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48K+ *,"Engineering,Industrial,Oracle,Power BI,R,R&D",",,,,",Italy
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108K+,"AWS,Azure,Computer Science,Consulting,Dataflow...","Flex hours,Flex vacation,Parental leave,Unlimi...",United States


## dealing with the salary currencies and asterisk
---

In [124]:
# from the salary column, get anything onther than numbers , + , * in a unique list
def get_salary_currency(salary):
    salary = salary.replace(' ', '').replace('+', '').replace('*', '').replace('K', '')
    salary = salary.translate(str.maketrans('', '', '0123456789'))
    return salary
currencies = []
currencies = df['Salary'].apply(get_salary_currency).unique()
currencies

array(['', 'Negociable', 'EUR', 'GBP'], dtype=object)

In [125]:
# drop Negociable and empty values from currencies
currencies = currencies[2:]
currencies

array(['EUR', 'GBP'], dtype=object)

In [126]:
# normalize the salary column to be in USD
def normalize_salary(salary):
    salary = salary.replace('+', '').replace('*', '').replace(' ', '').replace('K', '')
    if 'EUR' in salary:
        salary = salary.replace('EUR', '')
        return float(salary) * 1.22
    elif 'GBP' in salary:
        salary = salary.replace('GBP', '')
        return float(salary) * 1.39
    else:
        return salary
    
df['Salary_in_1000_USD'] = df['Salary'].apply(normalize_salary)

In [127]:
# function to add negoiable if salary contains '+' and asterisk if salary contains '*' 
def negociable_asterisk(salary):
    negociable_asterisk = []
    if '+' in salary:
        negociable_asterisk.append(1)
    else:
        negociable_asterisk.append(0)
    if '*' in salary:
        negociable_asterisk.append(1)
    else:
        negociable_asterisk.append(0)
    return negociable_asterisk

df['Negociable'] = df['Salary'].apply(negociable_asterisk).apply(lambda x: x[0])
df['Asterisk'] = df['Salary'].apply(negociable_asterisk).apply(lambda x: x[1])


In [128]:
df.head()

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities,country,Salary_in_1000_USD,Negociable,Asterisk
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48K+ *,"Computer Science,Data quality,Genetics,Mathema...",",,,,",United States,48,1,1
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48K+ *,"Agile,Data management,Finance,Security,,",",,,,",Mauritius,48,1,1
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,Not-Specified,90K+ *,"Agile,Architecture,AWS,Computer Science,Comput...","Career development,,,,",United States,90,1,1
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48K+ *,"Engineering,Industrial,Oracle,Power BI,R,R&D",",,,,",Italy,48,1,1
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108K+,"AWS,Azure,Computer Science,Consulting,Dataflow...","Flex hours,Flex vacation,Parental leave,Unlimi...",United States,108,1,0


## Categorizing job offers in AI / DS / BD
---

In [129]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Job titles data
job_titles = df['Job Title'].values

# Define the categories
categories = {
    'Data Science': [],
    'Artificial Intelligence': [],
    'Big Data': []
}

# Preprocess the job titles by converting to lowercase and removing special characters
preprocessed_titles = [" ".join(title.lower().split()) for title in job_titles]

# Use TF-IDF vectorizer to transform job titles into feature vectors with both unigrams and bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Consider both unigrams and bigrams
tfidf_matrix = vectorizer.fit_transform(preprocessed_titles)

# Get the feature names (words and bigrams)
feature_names = vectorizer.get_feature_names_out()

# Iterate through each category and identify the top keywords
for category in categories.keys():
    # Get the indices of job titles belonging to this category
    indices = [i for i, title in enumerate(job_titles) if category in title]
    if len(indices) > 0:
        # Compute the average TF-IDF score for each word/bigram across all job titles in the category
        avg_tfidf_scores = np.mean(tfidf_matrix[indices], axis=0).A1
        # Get the indices of top keywords (words/bigrams with highest average TF-IDF scores)
        top_keyword_indices = avg_tfidf_scores.argsort()[::-1][:20] 
        # Get the top keywords from feature names
        top_keywords = [feature_names[idx] for idx in top_keyword_indices]
        categories[category] = top_keywords

# Print the keywords for each category
for category, keywords in categories.items():
    # save in a table
    category
    print(f"Category: {category}")
    print("Keywords:", keywords)
    print()

Category: Data Science
Keywords: ['data science', 'science', 'data', 'manager', 'science manager', 'director', 'science intern', 'of', 'director data', 'intern', 'manager data', 'consultant', 'of data', 'lead', 'senior', 'head of', 'head', 'science lead', 'senior manager', 'scientist data']

Category: Artificial Intelligence
Keywords: ['artificial intelligence', 'artificial', 'intelligence', 'director artificial', 'intelligence machine', 'and machine', 'intelligence and', 'machine learning', 'machine', 'learning', 'director', 'lead', 'lead kpmg', 'kpmg', 'kpmg futures', 'futures', 'intelligence lead', 'and', 'engineer', 'hardware architect']

Category: Big Data
Keywords: ['big data', 'big', 'engineer', 'data', 'data engineer', 'senior big', 'engineer big', 'senior', 'architect big', 'sr big', 'hadoop', 'devops', 'solutions', 'sr', 'architect', 'software engineer', 'solutions architect', 'specialist', 'software', 'data machine']



In [130]:
def detect_category_with_spacy(job_title):
    # Define your predefined categories and their corresponding keywords/entities
    categories = {
        'Data Science': ['data science', 'science', 'data', 'manager', 'science manager', 'director', 'science intern', 'of', 'director data', 'intern', 'manager data', 'consultant', 'of data', 'lead', 'senior', 'head of', 'head', 'science lead', 'senior manager', 'scientist data'],
        'Artificial Intelligence': ['artificial intelligence', 'artificial', 'intelligence', 'director artificial', 'intelligence machine', 'and machine', 'intelligence and', 'machine learning', 'machine', 'learning', 'director', 'lead', 'lead kpmg', 'kpmg', 'kpmg futures', 'futures', 'intelligence lead', 'and', 'engineer', 'hardware architect'],
        'Big Data': ['big data', 'big', 'engineer', 'data', 'data engineer', 'senior big', 'engineer big', 'senior', 'architect big', 'sr big', 'hadoop', 'devops', 'solutions', 'sr', 'architect', 'software engineer', 'solutions architect', 'specialist', 'software', 'data machine']
    }
    job_title_lower = job_title.lower()

    for category, keywords in categories.items():
        for keyword in keywords:
            if keyword in job_title_lower:
                return category

    return 'Other'

In [131]:
# Detect the category of each job title
df['Job_category'] = df['Job Title'].apply(detect_category_with_spacy)
df.head()

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities,country,Salary_in_1000_USD,Negociable,Asterisk,Job_category
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48K+ *,"Computer Science,Data quality,Genetics,Mathema...",",,,,",United States,48,1,1,Data Science
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48K+ *,"Agile,Data management,Finance,Security,,",",,,,",Mauritius,48,1,1,Data Science
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,Not-Specified,90K+ *,"Agile,Architecture,AWS,Computer Science,Comput...","Career development,,,,",United States,90,1,1,Artificial Intelligence
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48K+ *,"Engineering,Industrial,Oracle,Power BI,R,R&D",",,,,",Italy,48,1,1,Data Science
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108K+,"AWS,Azure,Computer Science,Consulting,Dataflow...","Flex hours,Flex vacation,Parental leave,Unlimi...",United States,108,1,0,Data Science


In [132]:
# number of jobs in each category
df['Job_category'].value_counts()

Job_category
Data Science               2451
Artificial Intelligence     375
Other                       148
Big Data                     21
Name: count, dtype: int64

## Save the preprocessed data
---

In [133]:
# save the data
df.to_csv('data/preprocessed_data.csv', index=False)