In [1]:
# Import necessary packages
import pandas as pd

In [2]:
# Load dataset
df = pd.read_csv('job_opportunities.csv', encoding='ISO-8859-1')

## Cleaning

#### Pre-cleaning

In [3]:
df.rename(columns={'Requirment of the company ': 'Requirment of the company'}, inplace=True)

# Drop last row
df.drop(df.loc[df['Company'].isnull()].index, axis='rows', inplace=True)

# Drop duplicated rows
df.drop_duplicates(inplace=True)

#### Salary

In [4]:
# Save null salary in another dataframe
df_salary_isnull = df.loc[df['Salary'].isnull()]

# Delete those null salary from the main dataframe
df.drop(df_salary_isnull.index, inplace=True)

# Create new column that contains stars True or False
df['Salary_has_star'] = df['Salary'].str.contains('\*')

# Take +, *, and K from salary 
df['Salary'] = df['Salary'].apply(lambda x: str.replace(x, '+', ''))
df['Salary'] = df['Salary'].apply(lambda x: str.replace(x, '*', ''))
df['Salary'] = df['Salary'].apply(lambda x: str.replace(x, 'K', ''))
df['Salary'] = df['Salary'].apply(lambda x: str.strip(x))

In [5]:
# Function is_number
def is_number(number):
    try:
        number = float(number)
        return isinstance(number, float)
    except:
        return False
    
# Function take_currency
import re
def take_currency(salary):
    salary = re.sub(r'[0-9+]', '', salary)
    return str.strip(salary)

# Function remove_currency
def remove_currency(salary):
    salary = re.sub(r'[^0-9]', '', salary)
    return salary

In [6]:
# Create new column that contain the currency
df['Currency'] = df['Salary'].apply(lambda x: 'USD' if is_number(x) else take_currency(x))

# Remove currency from salary
df['Salary'] = df['Salary'].apply(lambda x: remove_currency(x))

# Change salary type to float and multiply to 1000
df['Salary'] = df['Salary'].astype(float) * 1000

df.reset_index(drop=True, inplace=True)

In [7]:
# Normalize salary
from currency_converter import CurrencyConverter

converter = CurrencyConverter()

for i in range(len(df)):
    currency = df['Currency'][i]

    if currency != 'USD':
        df['Salary'][i] = converter.convert(df['Salary'][i], currency, "USD")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Salary'][i] = converter.convert(df['Salary'][i], currency, "USD")


In [8]:
# Drop the currency column after transforming the salary
df.drop('Currency', axis='columns', inplace=True)

#### Location

In [9]:
df['Location'] = df['Location'].apply(lambda x: str.lower(str.strip(x)))

In [10]:
# # Import Geotext
# from geotext import GeoText

# # Create country column
# df['Country'] = df['Location'].apply(lambda x: GeoText(x).countries[0] if len(GeoText(x).countries) > 0 else None)

#### Requirement and facilities

In [11]:
# Change those to list
df['Requirment of the company'] = df['Requirment of the company'].apply(lambda x: str.split(x, ','))
df['Facilities'] = df['Facilities'].apply(lambda x: str.split(x, ','))

In [12]:
# Create function to delete all empty strings
def delete_empty_in_list(listname):
    while '' in listname:
        listname.remove('')
    return listname

In [13]:
# Remove empty strings on the lists
df['Requirment of the company'] = df['Requirment of the company'].apply(lambda x: delete_empty_in_list(x))
df['Facilities'] = df['Facilities'].apply(lambda x: delete_empty_in_list(x))

In [14]:
# Handle empty facilities
df['Facilities'] = df['Facilities'].apply(lambda x: ['No facilities'] if len(x) == 0 else x)

#### Experience level

In [15]:
# Handle experience level null values
df['Experience level'] = df['Experience level'].apply(lambda x: 'Not specified' if pd.isnull(x) else x)

#### Job category

In [16]:
# Create keywords for jobs
data_science_keywords = [
    'scientist', 'science', 'data analysis', 'data analyst', 'data mining', 'predictive modeling',
    'machine learning', 'statistical analysis', 'data visualization',
    'exploratory data analysis', 'data cleaning', 'feature engineering',
    'regression analysis', 'classification', 'clustering',
    'natural language processing', 'time series analysis', 'data product manager', 'research analyst', 'data analytics',
    'data quality', 'bi', 'business intelligence', 'data management', 'data project management', 'digital analytics',
    'data modeler', 'data product owner', 'cloud database analyst', 'data manager', 'data strategy', 'data specialist',
    'analytics engineer', 'master data', 'data operations', 'data operator', 'dataops', 'data strategist', 'data systems',
    'data reporter', 'data and control systems', 'data developer', 'data analyse', 'data visualisation', 'analyst',
    'data strategies', 'head of data', 'ml'
]

big_data_keywords = [
    'big', 'big data', 'data engineer', 'data enginner','hadoop', 'apache spark', 'spark', 'nosql', 'mapreduce',
    'distributed computing', 'data storage and retrieval', 'data scalability', 'etl',
    'data volume', 'data velocity', 'data variety', 'data processing',
    'data architecture', 'data streaming', 'data lakes', 'streaming data pipelines', 'data architect', 'data storage',
    'data pipeline', 'data platform', 'dataset', 'databricks', 'data integrations', 'data infrastructure', 'data integration',
    'database engineer', 'data lake', 'data modeller', 'data production', 'cloud data', 'data modelling', 'data modeling',
    'database tools'
]

ai_keywords = [
    'artificial intelligence', 'ai', 'machine learning', 'deep learning',
    'neural networks', 'natural language processing', 'computer vision',
    'reinforcement learning', 'robotics', 'expert systems',
    'cognitive computing', 'ai algorithms', 'sentiment analysis',
    'speech recognition', 'image recognition', 'autonomous systems', 'ml',
    'autonomous', 'autonomy', 'robotic', 'vision', 'text analytics', 'chatbot', 'nlp', 'model inference'
]

In [17]:
# Function to classify job titles into DS, AI or Big Data
def classify_job(title):
    str.lower(title)
    categories = []
    if any(keyword in title for keyword in ai_keywords):
        categories.append('Artificial Intelligence')
    if any(keyword in title for keyword in data_science_keywords):
        categories.append('Data Science')
    if any(keyword in title for keyword in big_data_keywords):
        categories.append('Big Data')
    if not categories :
        categories.append('Other')
    return categories

In [18]:
df['Job Category'] = df['Job Title'].apply(lambda x: classify_job(x.lower()))

## Stockage

#### Tendance clés en IA, DS, Big DATA

In [22]:
# Import necessary packages
from sqlalchemy import create_engine, text, Integer, String, Float, Boolean, Date, Column, inspect, UniqueConstraint, ForeignKey
from sqlalchemy.orm import create_session, declarative_base, Relationship, sessionmaker

username = 'SA'
password = 'YourPassword123'
hostname = 'localhost'
database = 'Job_opportunities'
driver = 'ODBC+Driver+18+for+SQL+Server'

connection_string = f'mssql+pyodbc://{username}:{password}@{hostname}/{database}?driver={driver}&Encrypt=No'

# Create sqlalchemy engine
engine = create_engine(connection_string)

In [23]:
# Create tables

# Create a base
Base = declarative_base()

# locations
class Locations(Base):
    __tablename__ = 'locations'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    location = Column(String(255), nullable=False)

    # Constraints
    __table_args__ = (UniqueConstraint(location, name='locations_location_UQ'),)

# facilities
class Facilities(Base):
    __tablename__ = 'facilities'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    facility = Column(String(255), nullable=False)

    # Constraints
    __table_args__ = (UniqueConstraint(facility, name='facilities_facility_UQ'),)

# job_types
class JobTypes(Base):
    __tablename__ = 'job_types'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    type = Column(String(50), nullable=False)

    # Constraints
    __table_args__ = (UniqueConstraint(type, name='job_types_type_UQ'),)

# experience_levels
class ExperienceLevels(Base):
    __tablename__ = 'experience_levels'
    
    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    level = Column(String(50), nullable=False)

    # Constraints
    __table_args__ = (UniqueConstraint(level, name='experience_levels_level_UQ'),)

# requirements
class Requirements(Base):
    __tablename__ = 'requirements'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    requirement = Column(String(255), nullable=False)

    # Constraints
    __table_args__ = (UniqueConstraint(requirement, name='requirements_requirement_UQ'),)

# jobs
class Jobs(Base):
    __tablename__ = 'jobs'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String, nullable=False)
    salary = Column(Float, nullable=False)
    salary_has_star = Column(Boolean, nullable=False)
    job_type_id = Column(Integer, ForeignKey('job_types.id'))
    location_id = Column(Integer, ForeignKey('locations.id'))
    experience_level_id = Column(Integer, ForeignKey('experience_levels.id'))

    # Relationships
    job_types_jobs = Relationship('JobTypes', backref='jobs')
    locations_jobs = Relationship('Locations', backref='jobs')
    experience_levels_jobs = Relationship('ExperienceLevels', backref='jobs')

# job_facilities
class JobFacilities(Base):
    __tablename__ = 'job_facilities'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    job_id = Column(Integer, ForeignKey('jobs.id'))
    facility_id = Column(Integer, ForeignKey('facilities.id'))

    # Relationships
    job_jf = Relationship('Jobs', backref='job_facilities')
    facility_fj = Relationship('Facilities', backref='job_facilities')

# job_requirements
class JobRequirements(Base):
    __tablename__ = 'job_requirements'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    requirement_id = Column(Integer, ForeignKey('requirements.id'))
    job_id = Column(Integer, ForeignKey('jobs.id'))

    # Relationships
    requirement_jr = Relationship('Requirements', backref='job_requirements')
    job_jr = Relationship('Jobs', backref='job_requirements')


# Categories
class Categories(Base):
    __tablename__ = 'categories'

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    category = Column(String(255), nullable=False)

    # UniqueConstraints
    __table_args__ = (UniqueConstraint(category, name='categories_category_UQ'),)

# Job_categories
class JobCategories(Base):
    __tablename__ = 'job_categories'

    # Column
    id = Column(Integer, primary_key=True, autoincrement=True)
    category_id = Column(Integer, ForeignKey('categories.id'))
    job_id = Column(Integer, ForeignKey('jobs.id'))

    # Relationships
    category_jc = Relationship('Categories', backref='job_categories')
    job_jc = Relationship('Jobs', backref='job_categories')

try:
    Base.metadata.create_all(engine)
    print('Tables created successfully')
except Exception as ex:
    print(ex)

Tables created successfully


In [24]:
inspector = inspect(engine)

inspector.get_table_names()

['categories',
 'experience_levels',
 'facilities',
 'job_categories',
 'job_facilities',
 'job_requirements',
 'job_types',
 'jobs',
 'locations',
 'requirements']

In [None]:
# with engine.connect() as conn:
#     conn.execute(text('drop table categories'))
#     conn.commit()

### Insertion des données dans la base

In [25]:
# Create session
session = create_session(engine)

# Tables that don't have foregin keys
try:
    # Locations
    for location in df['Location'].unique():
        new_location = Locations(location=str(location))
        session.add(new_location)

    # Facilities
    for facility in df['Facilities'].explode().unique():
        new_facility = Facilities(facility=str(facility))
        session.add(new_facility)

    # Categories
    for category in df['Job Category'].explode().unique():
        new_category = Categories(category=str(category))
        session.add(new_category)

    # Requirements
    for requirement in df['Requirment of the company'].explode().unique():
        new_requirement = Requirements(requirement=str(requirement))
        session.add(new_requirement)

    # Experience levels
    for experience_level in df['Experience level'].unique():
        new_experience_level = ExperienceLevels(level=str(experience_level))
        session.add(new_experience_level)

    # Job types
    for job_type in df['Job Type'].unique():
        new_job_type = JobTypes(type=str(job_type))
        session.add(new_job_type)

    session.commit()
    print('All rows inserted successfully')
except Exception as e:
    session.rollback()
    print('Error', e)
    print('No row inserted')
finally:
    session.close()

In [None]:
# Create session
session = create_session(engine)

# insert into jobs
with session.begin():
    try:
        for i in range(len(df)):
            title = df['Job Title'][i]
            salary = float(df['Salary'][i])
            salary_has_star = bool(df['Salary_has_star'][i])
            job_type = df['Job Type'][i]
            job_type_id = session.query(JobTypes).filter(JobTypes.type==job_type).first().id
            location = df['Location'][i]
            location_id = session.query(Locations).filter_by(location=location).first().id
            experience_level = df['Experience level'][i]
            experience_level_id = session.query(ExperienceLevels).filter_by(level=experience_level).first().id

            # new jobs
            new_job = Jobs(
                title=title,
                salary=salary,
                salary_has_star=salary_has_star,
                job_type_id=job_type_id,
                location_id=location_id,
                experience_level_id=experience_level_id
            )
            session.add(new_job)

        # Commit the transaction
        session.commit()
        print('All rows inserted successfully')
    except Exception as e:
        # Roll back the transaction
        session.rollback()
        print('Error ',e)
        print('No row inserted')
session.close()

In [26]:
# Create dataframe to insert in the database
df_to_db = df[[
    'Job Title', 
    'Facilities', 
    'Job Category',
    'Requirment of the company'
]].explode('Facilities', ignore_index=True)
df_to_db = df_to_db.explode('Job Category', ignore_index=True)
df_to_db = df_to_db.explode('Requirment of the company', ignore_index=True)
df_to_db.shape

(60309, 4)

In [51]:
# Insert into relations tables
Session = sessionmaker(bind=engine)
session = Session()


for i in range(len(df_to_db)):
    with session.begin():    
        try:
            # Get job_id
            title = df_to_db['Job Title'][i]
            job_id = session.query(Jobs).filter_by(title=title).first().id

            # Get facility id 
            facility = df_to_db['Facilities'][i]
            facility_id = session.query(Facilities).filter_by(facility=facility).first().id

            # Get category id
            category = df_to_db['Job Category'][i]
            category_id = session.query(Categories).filter_by(category=category).first().id

            # Get the requirement id
            requirement = df_to_db['Requirment of the company'][i]
            requirement_id = session.query(Requirements).filter_by(requirement=requirement).first().id

            # Insert row in job facility
            new_job_facility = JobFacilities(job_id=job_id, facility_id=facility_id)
            session.add(new_job_facility)

            # Insert row in job categories
            new_job_category = JobCategories(category_id=category_id, job_id=job_id)
            session.add(new_job_category)

            # Insert row in job requirements
            new_job_requirement = JobRequirements(requirement_id=requirement_id, job_id=job_id)
            session.add(new_job_requirement)
            
            # Commit the transaction
            session.commit()
            print(f"Row {i} inserted successfully", end='\r')
        except Exception as e:
            # Roll back the transaction
            session.rollback()
            print(f"Row {i} has a problem")
            print(e)

In [48]:
session.query(JobRequirements).count()

60309

### Analyse et visualisation

In [None]:

# Import plotly package
import plotly.express as px
