In [9]:
# Import necessary packages
import pandas as pd

In [10]:
# Load dataset
df = pd.read_csv('job_opportunities.csv', encoding='ISO-8859-1')

## Cleaning

#### Pre-cleaning

In [11]:
df.rename(columns={'Requirment of the company ': 'Requirment of the company'}, inplace=True)

In [12]:
# Drop last row
df.drop(df.loc[df['Company'].isnull()].index, axis='rows', inplace=True)

In [13]:
# Drop duplicated rows
df.drop_duplicates(inplace=True)

#### Salary

In [14]:
# Save null salary in another dataframe
df_salary_isnull = df.loc[df['Salary'].isnull()]

# Delete those null salary from the main dataframe
df.drop(df_salary_isnull.index, inplace=True)

In [15]:
# Create new column that contains stars True or False
df['Salary_has_star'] = df['Salary'].str.contains('\*')

In [16]:
# Take +, *, and K from salary 

df['Salary'] = df['Salary'].apply(lambda x: str.replace(x, '+', ''))
df['Salary'] = df['Salary'].apply(lambda x: str.replace(x, '*', ''))
df['Salary'] = df['Salary'].apply(lambda x: str.replace(x, 'K', ''))
df['Salary'] = df['Salary'].apply(lambda x: str.strip(x))

In [17]:
# Function is_number
def is_number(number):
    try:
        number = float(number)
        return isinstance(number, float)
    except:
        return False

In [18]:
# Function take_currency
import re
def take_currency(salary):
    salary = re.sub(r'[0-9+]', '', salary)
    return str.strip(salary)

In [19]:
# Function remove_currency
def remove_currency(salary):
    salary = re.sub(r'[^0-9]', '', salary)
    return salary

In [20]:
# Create new column that contain the currency
df['Currency'] = df['Salary'].apply(lambda x: 'USD' if is_number(x) else take_currency(x))

In [21]:
# Remove currency from salary
df['Salary'] = df['Salary'].apply(lambda x: remove_currency(x))

In [22]:
# Change salary type to float and multiply to 1000
df['Salary'] = df['Salary'].astype(float) * 1000

In [23]:
df.reset_index(drop=True, inplace=True)

In [24]:
# Normalize salary
from currency_converter import CurrencyConverter

converter = CurrencyConverter()

for i in range(len(df)):
    currency = df['Currency'][i]

    if currency != 'USD':
        df['Salary'][i] = converter.convert(df['Salary'][i], currency, "USD")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Salary'][i] = converter.convert(df['Salary'][i], currency, "USD")


In [25]:
# Drop the currency column after transforming the salary
df.drop('Currency', axis='columns', inplace=True)

#### Location

In [26]:
# Import Geotext
from geotext import GeoText

# Create country column
df['Country'] = df['Location'].apply(lambda x: GeoText(x).countries[0] if len(GeoText(x).countries) > 0 else None)

#### Requirement and facilities

In [27]:
# Change those to list
df['Requirment of the company'] = df['Requirment of the company'].apply(lambda x: str.split(x, ','))
df['Facilities'] = df['Facilities'].apply(lambda x: str.split(x, ','))

In [28]:
# Create function to delete all empty strings
def delete_empty_in_list(listname):
    while '' in listname:
        listname.remove('')
    return listname

In [29]:
# Remove empty strings on the lists
df['Requirment of the company'] = df['Requirment of the company'].apply(lambda x: delete_empty_in_list(x))
df['Facilities'] = df['Facilities'].apply(lambda x: delete_empty_in_list(x))

In [30]:
# Handle empty facilities
df['Facilities'] = df['Facilities'].apply(lambda x: ['No facilities'] if len(x) == 0 else x)

#### Experience level

In [31]:
# Handle experience level null values
df['Experience level'] = df['Experience level'].apply(lambda x: 'Not specified' if pd.isnull(x) else x)

## Stockage

In [32]:
# Import necessary packages
from sqlalchemy import create_engine, text, Integer, String, Float, Boolean, Date, Column, inspect, UniqueConstraint, ForeignKey
from sqlalchemy.orm import create_session, declarative_base, Relationship

username = 'SA'
password = 'YourPassword123'
hostname = 'localhost'
database = 'Job_opportunities'
driver = 'ODBC+Driver+18+for+SQL+Server'

connection_string = f'mssql+pyodbc://{username}:{password}@{hostname}/{database}?driver={driver}&Encrypt=No'

# Create sqlalchemy engine
engine = create_engine(connection_string)

In [33]:
# Create tables

# Create a base
Base = declarative_base()

# locations
class Locations(Base):
    __tablename__ = 'locations'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    location = Column(String(255), nullable=False)

    # Constraints
    __table_args__ = (UniqueConstraint(location, name='locations_location_UQ'),)

# facilities
class Facilities(Base):
    __tablename__ = 'facilities'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    facility = Column(String(255), nullable=False)

    # Constraints
    __table_args__ = (UniqueConstraint(facility, name='facilities_facility_UQ'),)

# job_types
class JobTypes(Base):
    __tablename__ = 'job_types'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    type = Column(String(50), nullable=False)

    # Constraints
    __table_args__ = (UniqueConstraint(type, name='job_types_type_UQ'),)

# experience_levels
class ExperienceLevels(Base):
    __tablename__ = 'experience_levels'
    
    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    level = Column(String(50), nullable=False)

    # Constraints
    __table_args__ = (UniqueConstraint(level, name='experience_levels_level_UQ'),)

# requirements
class Requirements(Base):
    __tablename__ = 'requirements'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    requirement = Column(String(255), nullable=False)

    # Constraints
    __table_args__ = (UniqueConstraint(requirement, name='requirements_requirement_UQ'),)

# jobs
class Jobs(Base):
    __tablename__ = 'jobs'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String, nullable=False)
    salary = Column(Float, nullable=False)
    salary_has_star = Column(Boolean, nullable=False)
    job_type_id = Column(Integer, ForeignKey('job_types.id'))
    location_id = Column(Integer, ForeignKey('locations.id'))
    experience_level_id = Column(Integer, ForeignKey('experience_levels.id'))

    # Relationships
    job_types_jobs = Relationship('job_types', backref='jobs')
    locations_jobs = Relationship('locations', backref='jobs')
    experience_levels_jobs = Relationship('experience_levels', backref='jobs')

# job_facilities
class JobFacilities(Base):
    __tablename__ = 'job_facilities'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    job_id = Column(Integer, ForeignKey('jobs.id'))
    facility_id = Column(Integer, ForeignKey('facilities.id'))

    # Relationships
    job_jf = Relationship('jobs', backref='job_facilities')
    facility_fj = Relationship('facilities', backref='job_facilities')

# job_requirements
class JobRequirements(Base):
    __tablename__ = 'job_requirements'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    requirement_id = Column(Integer, ForeignKey('requirements.id'))
    job_id = Column(Integer, ForeignKey('jobs.id'))

    # Relationships
    requirement_jr = Relationship('requirements', backref='job_requirements')
    job_jr = Relationship('jobs', backref='job_requirements')
try:
    Base.metadata.create_all(engine)
    print('Tables created successfully')
except Exception as ex:
    print(ex)

Tables created successfully


In [34]:
inspector = inspect(engine)

inspector.get_table_names()

['experience_levels',
 'facilities',
 'job_facilities',
 'job_requirements',
 'job_types',
 'jobs',
 'locations',
 'requirements']

#### Tendance clés en IA, DS, Big DATA

In [35]:
df.head()

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities,Salary_has_star,Country
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48000.0,"[Computer Science, Data quality, Genetics, Mat...",[No facilities],True,United States
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48000.0,"[Agile, Data management, Finance, Security]",[No facilities],True,Mauritius
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,Not specified,90000.0,"[Agile, Architecture, AWS, Computer Science, C...",[Career development],True,United States
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48000.0,"[Engineering, Industrial, Oracle, Power BI, R,...",[No facilities],True,Italy
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108000.0,"[AWS, Azure, Computer Science, Consulting, Dat...","[Flex hours, Flex vacation, Parental leave, Un...",False,United States


In [38]:
df.shape

(2824, 10)

In [234]:
# Create keywords for jobs
data_science_keywords = [
    'scientist', 'science', 'data analysis', 'data analyst', 'data mining', 'predictive modeling',
    'machine learning', 'statistical analysis', 'data visualization',
    'exploratory data analysis', 'data cleaning', 'feature engineering',
    'regression analysis', 'classification', 'clustering',
    'natural language processing', 'time series analysis', 'data product manager', 'research analyst', 'data analytics',
    'data quality', 'bi'
]

big_data_keywords = [
    'big', 'big data', 'data engineer', 'hadoop', 'apache spark', 'nosql', 'mapreduce',
    'distributed computing', 'data storage and retrieval', 'data scalability',
    'data volume', 'data velocity', 'data variety', 'data processing',
    'data architecture', 'data streaming', 'data lakes', 'streaming data pipelines', 'data architect', 'data storage'
]

ai_keywords = [
    'artificial intelligence', 'ai', 'machine learning', 'deep learning',
    'neural networks', 'natural language processing', 'computer vision',
    'reinforcement learning', 'robotics', 'expert systems',
    'cognitive computing', 'ai algorithms', 'sentiment analysis',
    'speech recognition', 'image recognition', 'autonomous systems'
]


In [225]:
# Function to classify job titles into DS, AI or Big Data
def classify_job(title):
    str.lower(title)
    if any(keyword in title for keyword in ai_keywords):
        return 'Artificial Intelligence'
    elif any(keyword in title for keyword in data_science_keywords):
        return 'Data Science'
    elif any(keyword in title for keyword in big_data_keywords):
        return 'Big Data'
    else :
        return 'Other'

In [223]:
title = df['Job Title'][0]
title

'Clinical Data Analyst'

In [224]:
classify_job(title)

'Other'

In [235]:
df['Job Category'] = df['Job Title'].apply(lambda x: classify_job(x.lower()))

In [230]:
df['Job Category'].unique()

array(['Data Science', 'Artificial Intelligence', 'Big Data', 'Other'],
      dtype=object)

In [236]:
df.loc[df['Job Category'] == 'Other']

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities,Salary_has_star,Country,Job Category
10,YouGov,Graduate Power BI Developer,"Mumbai, India",Full Time,Entry-level,35000.0,"[Data Analytics, Excel, Finance, Market resear...",[Career development],True,India,Other
11,Bosch Group,SAP Consultant - Product Data Management,"Braga, Portugal",Full Time,Senior-level,62000.0,"[Data management, Engineering, R, Spark]",[Flex hours],True,Portugal,Other
16,Issuu,BI Analyst,Braga,Full Time,Not specified,48000.0,"[Business Analytics, Business Intelligence, Da...","[Competitive pay, Equity, Health care, Insurance]",True,,Other
20,Talan,Data Management Scrum Master,"Málaga, Spain",Full Time,Entry-level,39000.0,"[Agile, Azure, Big Data, Blockchain, Databrick...",[No facilities],True,Spain,Other
25,Publicis Groupe,BI Developer,"New York City, United States",Full Time,Senior-level,106000.0,"[Agile, Architecture, Business Intelligence, D...",[Health care],False,United States,Other
...,...,...,...,...,...,...,...,...,...,...,...
2759,Audigent,Mid-Level Software Engineer - Data Team,"United Kingdom, Europe, Remote",Full Time,Mid-level,45000.0,"[Airflow, APIs, Athena, AWS, Azure, Big Data]",[Startup environment],True,United Kingdom,Other
2794,OpenAI,"Software Engineer, Model Inference","San Francisco, California, United States",Full Time,Senior-level,200000.0,"[AGI, APIs, Architecture, Azure, ChatGPT, CUDA]","[Career development, Equity, Flex vacation, He...",False,United States,Other
2800,Informa Group Plc.,Data Operations Engineer,"Toronto, ON, Canada",Full Time,Not specified,60000.0,"[Agile, AWS, DataOps, Engineering, ETL, Pipeli...","[Career development, Flex hours, Flex vacation...",True,Canada,Other
2811,Galileo Financial Technologies,Staff ETL Developer,UT - Remote; UT - Cottonwood Heights,Full Time,Senior-level,63000.0,"[Computer Science, Data pipelines, Data wareho...","[Flex hours, Health care, Insurance]",True,,Other


In [75]:
# Big data
df.loc[df['Job Title'].str.contains('Big') | df['Job Title'].str.contains('BigData') | df['Job Title'].str.contains('Bigdata') | df['Job Title'].str.contains('Big - Data')].shape

(41, 10)

In [85]:
# Artificial Intelligence
df.loc[df['Job Title'].str.contains(str.capitalize('Artificial'))].shape

(19, 10)

In [95]:
# Data Science
df.loc[df['Job Title'].str.contains(str('Data Science')) | df['Job Title'].str.contains('Scientist')].shape

(557, 10)

In [98]:
df.loc[~df['Job Title'].str.contains('Data') & df['Job Title'].str.contains('Scientist')]

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities,Salary_has_star,Country
143,Altos Labs,"Scientist/Senior Scientist, Machine Learning","San Francisco Bay Area, CA",Full Time,Senior-level,69000.0,"[Computer Science, Data analysis, Deep Learnin...","[Career development, Conferences]",True,
175,"Nuro, Inc.",Machine Learning Research Scientist,"Mountain View, California (HQ)",Full Time,Entry-level,167000.0,"[Computer Science, Data pipelines, Deep Learni...","[Career development, Competitive pay, Conferen...",False,
183,Publicis Groupe,"Senior Scientist, Decision Sciences","Irving, TX, United States",Full Time,Senior-level,69000.0,"[Big Data, Computer Science, Data analysis, Da...","[Career development, Competitive pay, Health c...",True,United States
199,"Nuro, Inc.",Machine Learning Research Scientist - Reinforc...,"Mountain View, California (HQ)",Full Time,Entry-level,167000.0,"[Computer Science, Deep Learning, ICLR, ICML, ...","[Career development, Competitive pay, Conferen...",False,
217,NobleAI,Research Scientist,"San Francisco, California, United States",Full Time,Mid-level,158000.0,"[Biology, Chemistry, Computer Science, Deep Le...","[401(k) matching, Career development, Equity, ...",False,United States
...,...,...,...,...,...,...,...,...,...,...
2729,Intercom,Senior Machine Learning Scientist,"Dublin, Ireland",Full Time,Senior-level,129000.0,"[Bayesian, Clustering, Data analysis, Deep Lea...","[Career development, Competitive pay, Equity, ...",True,Ireland
2767,Spotify,"Research Scientist, Computational Economics",London,Full Time,Senior-level,89000.0,"[Computer Science, Economics, Machine Learning...",[Conferences],True,
2769,23andMe,"Scientist I/II, Computational Biology - Target...","South San Francisco, California",Full Time,Senior-level,128000.0,"[AWS, Azure, Biology, Data analysis, Drug disc...",[Health care],False,
2771,Amazon.com,"Applied Scientist, Generative AI, Creative X","Seattle, Washington, USA",Full Time,Senior-level,136000.0,"[A/B testing, Big Data, Computer Science, Comp...","[Career development, Conferences, Equity, Star...",False,


In [96]:
df.loc[~df['Job Title'].str.contains('Data Science') & ~df['Job Title'].str.contains('Scientist') & ~df['Job Title'].str.contains('Big') & ~df['Job Title'].str.contains('Artificial')]

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities,Salary_has_star,Country
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48000.0,"[Computer Science, Data quality, Genetics, Mat...",[No facilities],True,United States
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48000.0,"[Agile, Data management, Finance, Security]",[No facilities],True,Mauritius
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,Not specified,90000.0,"[Agile, Architecture, AWS, Computer Science, C...",[Career development],True,United States
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48000.0,"[Engineering, Industrial, Oracle, Power BI, R,...",[No facilities],True,Italy
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108000.0,"[AWS, Azure, Computer Science, Consulting, Dat...","[Flex hours, Flex vacation, Parental leave, Un...",False,United States
...,...,...,...,...,...,...,...,...,...,...
2816,ServiceNow,Sr Software QA Engineer-Machine Learning QE,"Santa Clara, California, United States",Full Time,Senior-level,117000.0,"[Engineering, Git, JavaScript, Machine Learnin...","[401(k) matching, Competitive pay, Equity, Fle...",False,United States
2817,Marley Spoon,People Data Specialist,"Lisbon, Lisbon, Portugal",Full Time,Mid-level,70000.0,"[Data analysis, Excel, Finance]","[Career development, Equity, Flex hours, Flex ...",True,Portugal
2818,TripAdvisor,Senior Machine Learning Engineer (Trip Planning),"Needham, Massachusetts, MA",Full Time,Senior-level,170000.0,"[Big Data, Computer Science, Computer Vision, ...","[Career development, Conferences, Flex hours, ...",False,
2819,CCRi,"Application Integration Engineer, Computer Vis...","Chantilly, Virginia, United States",Full Time,Mid-level,113000.0,"[Agile, Angular, APIs, Architecture, AWS, Azure]","[401(k) matching, Career development, Flex hou...",False,United States


In [72]:
df['Job Title'][1389]

'Data Engineer - Concepteur - Développeur SQL - BigL'