### Introduction

#### Français
Ce Jupyter Notebook vise à représenter des données que j'ai moi-même extraites de sites d'emploi par web scraping. Ces données sont des offres d'emploi provenant essentiellement de LinkedIn ; à l'avenir, je mettrai en œuvre un scraper pour extraire également les offres du site web Indeed, afin de diversifier les données. Pour l'instant, avec les données en ma possession, j'ai représenté sous forme graphique les technologies web les plus couramment utilisées dans les projets web d'entreprise dans les pays de l'UE et dans certains pays d'Asie.

#### English
This Jupyter Notebook presents data that I extracted myself from job boards through web scraping. The data consists of job offers coming mainly from LinkedIn; in the future, I will implement a scraper to extract job offers from the Indeed website as well, in order to diversify the data. For now, with the data in my possession, I have represented in graphical form the web technologies most commonly used in companies' web projects in EU countries and some Asian countries.

### Import libraries

In [None]:
from pandas import Series, DataFrame, read_sql_table
from sqlalchemy import create_engine
import plotly.express as px

### Loading and filtering data

In [None]:
# Build the SQLAlchemy engine for the local MySQL database; the URL resolves
# to "mysql+pymysql://root:password@localhost:3306/web_scraping_project".
# NOTE(review): the credentials are hard-coded; consider reading them from
# environment variables before sharing this notebook.
disk_engine = create_engine(
    "{server}+{dialect}://{username}:{password}@{host}:{port}/{database}".format(
        server='mysql',
        dialect='pymysql',
        host='localhost',
        username='root',
        password='password',
        port='3306',
        database='web_scraping_project',
    ),
    # recycle pooled connections once they are older than 14400 s (4 hours)
    pool_recycle=14400,
)

# Load the whole jobs_offers table into a DataFrame.
df = read_sql_table('jobs_offers', disk_engine)

# Drop the columns that the analysis below does not use.
df.drop(['description', 'company_url', 'date_time', 'criteria'], axis=1, inplace=True)

# Use job_offer_id as the index.
df.set_index('job_offer_id', inplace=True)

# Keep only the offers whose technologies value is present and non-empty
# (identity check against None instead of the `!= None` comparison).
technologies_filter = df.technologies.apply(lambda d: d is not None and len(d) > 0)

# Apply the technologies filter and print a summary of what is left.
df = df[technologies_filter]
df.info()

### Utility functions

In [None]:
def tech_value_counts(df: DataFrame, normalize: bool = True, min_offers: int = 100) -> dict:
    """
    Count how often each technology is mentioned, grouped by category.

    :param df: DataFrame with a ``technologies`` column; each value is read
        as a mapping from a category name to a list of technology names.
        ``None`` values are skipped.
    :param normalize: forwarded to ``Series.value_counts``; when True the
        result holds relative frequencies instead of raw counts.
    :param min_offers: minimum number of job offers that must mention a
        category for that category to be kept (100 by default).
    :return: dict mapping each kept category to a tuple
        ``(value_counts Series, number of offers mentioning the category)``.
    """
    categories = (
        "Python Frameworks",
        "PHP Frameworks",
        "JavaScript Frameworks",
        "Main tech",
        "Java Frameworks",
        "Project management",
        "Hosting services",
        "DBMS",
        "Tests",
        "Other Frameworks",
        "App container",
        "Cloud computing",
        "CMS",
        "Bundlers",
        "Task runners",
    )
    # every technology mentioned per category, duplicates included
    mentions = {category: [] for category in categories}
    # number of job offers that mention each category at least once
    offers = dict.fromkeys(categories, 0)

    for entry in df.technologies:
        if entry is None:
            # an offer without technologies contributes nothing
            continue
        for category in categories:
            if category in entry:
                mentions[category].extend(entry[category])
                offers[category] += 1

    # keep only the categories backed by enough job offers
    return {
        category: (
            Series(mentions[category]).value_counts(normalize=normalize),
            offers[category],
        )
        for category in categories
        if offers[category] >= min_offers
    }

def create_bubble_charts(value_counts: dict, threshold: float, show: bool = True):
    """
    Create one bubble chart per technology category.

    :param value_counts: dict mapping a category name to a tuple
        ``(counts, total)``; ``counts`` yields ``(technology, value)`` pairs
        through ``.items()`` and ``total`` is the number of job offers shown
        in the chart title.
    :param threshold: only technologies whose value is at least this number
        are plotted; a category with no such technology gets no chart.
    :param show: when True every figure is displayed and nothing is
        returned; when False the list of figures is returned instead.
    :return: None if ``show`` is True, otherwise the list of figures.
    """
    figures = []
    for category, (counts, total) in value_counts.items():
        data = [
            dict(Technology=technology, Value=value)
            for technology, value in counts.items()
            if value >= threshold
        ]
        if not data:
            # Nothing reaches the threshold: an empty list gives plotly
            # express no 'Technology' / 'Value' columns to resolve, so the
            # category is skipped instead of raising.
            continue

        bubble_chart = px.scatter(
            data_frame=data,
            x='Technology',
            y='Value',
            text='Technology',
            size='Value',
            title=f"{category} category, number of job offers : {total}",
            hover_name="Technology",
            width=1000,
            height=800,
            size_max=100,
            template='plotly_dark',
        )
        # the technology names are already drawn as bubble labels
        bubble_chart.update_layout(xaxis=dict(visible=False))
        figures.append(bubble_chart)

    if not show:
        return figures
    for fig in figures:
        fig.show()
    return None

def create_pie_charts(value_counts: dict, threshold: float, show: bool = True):
    """
    Create one pie chart per technology category.

    :param value_counts: dict mapping a category name to a tuple
        ``(counts, total)``; ``counts`` yields ``(technology, value)`` pairs
        through ``.items()`` and ``total`` is the number of job offers shown
        in the chart title.
    :param threshold: only technologies whose value is at least this number
        are plotted; a category with no such technology gets no chart.
    :param show: when True every figure is displayed and nothing is
        returned; when False the list of figures is returned instead.
    :return: None if ``show`` is True, otherwise the list of figures.
    """
    figures = []
    for category, (counts, total) in value_counts.items():
        data = [
            dict(Technology=technology, Value=value)
            for technology, value in counts.items()
            if value >= threshold
        ]
        if not data:
            # Nothing reaches the threshold: an empty list gives plotly
            # express no 'Technology' / 'Value' columns to resolve, so the
            # category is skipped instead of raising.
            continue

        pie_chart = px.pie(
            data_frame=data,
            names='Technology',
            values='Value',
            hover_name="Technology",
            title=f"{category} category, number of job offers : {total}",
            width=1000,
            height=800,
            template='plotly_dark',
        )
        figures.append(pie_chart)

    if not show:
        return figures
    for fig in figures:
        fig.show()
    return None

def create_bar_charts(value_counts: dict, threshold: float, show: bool = True):
    """
    Create one bar chart per technology category.

    :param value_counts: dict mapping a category name to a tuple
        ``(counts, total)``; ``counts`` yields ``(technology, value)`` pairs
        through ``.items()`` and ``total`` is the number of job offers shown
        in the chart title.
    :param threshold: only technologies whose value is at least this number
        are plotted; a category with no such technology gets no chart.
    :param show: when True every figure is displayed and nothing is
        returned; when False the list of figures is returned instead.
    :return: None if ``show`` is True, otherwise the list of figures.
    """
    figures = []
    for category, (counts, total) in value_counts.items():
        data = [
            dict(Technology=technology, Value=value)
            for technology, value in counts.items()
            if value >= threshold
        ]
        if not data:
            # Nothing reaches the threshold: an empty list gives plotly
            # express no 'Technology' / 'Value' columns to resolve, so the
            # category is skipped instead of raising.
            continue

        bar_chart = px.bar(
            data_frame=data,
            x='Technology',
            y='Value',
            hover_name="Technology",
            title=f"{category} category, number of job offers : {total}",
            width=1000,
            height=800,
            template='plotly_dark',
        )
        figures.append(bar_chart)

    if not show:
        return figures
    for fig in figures:
        fig.show()
    return None

### Data Analysis

In [None]:
# Minimum relative frequency a technology needs within its category to be
# plotted (tech_value_counts normalizes the counts by default).
threshold = 0.005
# Per-category technology frequencies computed from the filtered job offers.
value_counts = tech_value_counts(df)

# Pie charts are displayed by default; the bubble and bar variants below take
# the same arguments and can be enabled instead.
create_pie_charts(value_counts=value_counts, threshold=threshold)
#create_bubble_charts(value_counts, threshold)
#create_bar_charts(value_counts, threshold)

In [None]:
# Number of job offers per country as a plain {country: count} dict.
counts_countries_dict = df.country.value_counts().to_dict()

# Axis labels and title are prepared up front to keep the plotting call short.
country_axis_labels = {'y': 'Number of job offers', 'x': 'Country'}
country_chart_title = f"Data volume by country, number of country : {len(counts_countries_dict)}"

# One bar per country, bar height = number of job offers.
fig_countries_freq = px.bar(
    x=counts_countries_dict.keys(),
    y=counts_countries_dict.values(),
    title=country_chart_title,
    labels=country_axis_labels,
    template='plotly_dark',
)
fig_countries_freq.show()