In [None]:
# Install the JobSpy scraping library; it provides `scrape_jobs`, which the
# import cell below pulls in with `from jobspy import scrape_jobs`.
!pip install -U python-jobspy

Collecting python-jobspy
  Downloading python_jobspy-1.1.76-py3-none-any.whl.metadata (10 kB)
Collecting NUMPY==1.26.3 (from python-jobspy)
  Downloading numpy-1.26.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting markdownify<0.14.0,>=0.13.1 (from python-jobspy)
  Downloading markdownify-0.13.1-py3-none-any.whl.metadata (8.5 kB)
Collecting tls-client<2.0.0,>=1.0.1 (from python-jobspy)
  Downloading tls_client-1.0.1-py3-none-any.whl.metadata (5.0 kB)
Downloading python_jobspy-1.1.76-py3-none-any.whl (38 kB)
Downloading numpy-1.26.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading markdownify-0.13.1-py3-none-any.whl (10 kB)
Downloading tls_client-1.0.1-py3-n

## Post-install step: checking the proxy list

In [None]:
# %%capture
# !wget https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt

In [None]:
# with open('http.txt', 'r') as file:
#     lines = file.readlines()
# proxies = [line.strip() for line in lines]

In [None]:
# %%capture
# import requests
# import threading

# working_proxies = []

# def check_proxy(proxy):
#     proxies = {'http': proxy, 'https': proxy}
#     try:
#         response = requests.get('https://google.com', proxies=proxies, timeout=5)
#         if response.status_code == 200:
#             working_proxies.append(proxy)
#     except requests.exceptions.RequestException:
#         print(f'Proxy {proxy} is not working')

# threads = []
# for proxy in proxies:
#     t = threading.Thread(target=check_proxy, args=(proxy,))
#     t.start()
#     threads.append(t)

# for t in threads:
#     t.join()

## Global configuration for scraping

In [None]:
# Import libraries
import csv
import pandas as pd
import re
from jobspy import scrape_jobs
from tqdm import tqdm
from itertools import product
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import time
import random
import os

# Search terms: every term is combined with every location and job type
search_terms = [
    "Data Scientist",
    "Data Analyst",
    "Cloud Architect",
    "Cloud Engineer",
    "Software Tester",
    "Machine Learning Engineer",
    "Software Engineer",
    "DevOps Engineer",
    "Stagiaires",
    "Développeur Informatique",
    "Java",
    "Fullstack",
    "Backend",
    "Frontend",
    "Back-end",
    "Front-end",
    "Pfe",
    "Internship",
    "Développeur",
    "devops",
    "#pfe",
]

# Locations used for the LinkedIn and Indeed searches
locations = [
    "Fez, Fès-Meknès, Morocco",
    "Casablanca Metropolitan Area",
    "Tanger-Tetouan-Al Hoceima, Morocco",
    "Rabat, Rabat-Salé-Kénitra, Morocco",
    "Marrakesh, Marrakesh-Safi, Morocco",
    "Morocco",
]

# Glassdoor gets its own list of location spellings
glassdoor_locations = [
    "Casablanca, Greater Casablanca",
    "Fès",
    "Rabat (Morocco)",
    "Tangier, Tanger-Tétouan (Morocco)",
    "Marrakech-Tensift-Al Haouz, Morocco",
]

#proxies = working_proxies

# Forwarded unchanged to scrape_jobs() as `results_wanted` / `hours_old`
results_wanted = 100
hours_old = 24

# Values forwarded as `job_type`; scrape_single_job maps them to display labels
job_types = ['internship', 'fulltime']

# Passed to the LinkedIn run only, as the `linkedin_fetch_description` option
linkedin_fetch_description = False

# Output folders: raw scraper dumps and the filtered CSVs derived from them
for output_dir in ('./raw_data', './processed_data'):
    os.makedirs(output_dir, exist_ok=True)

## Source code for scraping

In [None]:
def check_proxy(proxy):
    """Send one request through ``proxy`` and print the IP address the remote side saw.

    The request goes to httpbin's ``/ip`` endpoint, which answers with a JSON
    body of the form ``{"origin": "<caller ip>"}``. The previous target
    (google.com) returns HTML, so reading ``['origin']`` from it always failed
    and every proxy was reported as broken.

    Args:
        proxy: Proxy address as ``host:port`` (no scheme); ``http://`` is
            prepended for both the HTTP and HTTPS proxy entries.

    Returns:
        True when the request succeeded and the origin IP was printed,
        False when anything went wrong (the reason is printed).
    """
    proxy_url = f'http://{proxy}'
    try:
        response = requests.get(
            'http://httpbin.org/ip',
            proxies={'http': proxy_url, 'https': proxy_url},
            timeout=5,
        )
        # A proxy answering with an error page must count as a failure too
        response.raise_for_status()
        print(f"Proxy {proxy} IP: {response.json()['origin']}")
        return True
    except Exception as e:
        # Best-effort diagnostic: report the problem instead of aborting the scrape
        print(f"Proxy {proxy} failed: {e}")
        return False

def scrape_single_job(params):
    """Run one JobSpy search and tag the rows with a readable job-type label.

    Args:
        params: dict with the required keys ``site_name``, ``search_term``,
            ``location``, ``job_type``, ``results_wanted`` and ``hours_old``,
            plus the optional keys ``proxy`` (a single proxy address) and
            ``extra_params`` (additional keyword arguments for ``scrape_jobs``).

    Returns:
        The DataFrame returned by ``scrape_jobs``; when it has rows, a
        ``jobType`` column is added. An empty DataFrame is returned if the
        scrape raises.

    Raises:
        KeyError: if a required key is missing from ``params`` (these lookups
            happen before the error-swallowing ``try``).
    """
    site_name = params['site_name']
    search_term = params['search_term']
    location = params['location']
    job_type = params['job_type']
    results_wanted = params['results_wanted']
    hours_old = params['hours_old']
    proxy = params.get('proxy')
    extra_params = params.get('extra_params', {})

    # Human-readable value stored in the `jobType` column
    label_by_type = {'internship': 'Internship', 'fulltime': 'Full-time'}
    job_type_label = label_by_type.get(job_type, 'Unknown')

    try:
        # Random 1-3 s pause before each search
        time.sleep(random.uniform(1, 3))

        scrape_params = dict(
            site_name=site_name,
            search_term=search_term,
            location=location,
            results_wanted=results_wanted,
            hours_old=hours_old,
            job_type=job_type,
        )
        # `extra_params` may be None or empty; both mean "nothing to add"
        scrape_params.update(extra_params or {})

        if proxy:
            check_proxy(proxy)
            scrape_params['proxies'] = [proxy]

        jobs = scrape_jobs(**scrape_params)
        if jobs.empty:
            return jobs

        jobs['jobType'] = job_type_label
        return jobs
    except Exception as e:
        print(f"Error fetching jobs for '{search_term}' in '{location}' on '{site_name}': {e}")
        # Callers filter on `.empty`, so an empty frame marks a failed search
        return pd.DataFrame()

def scrape_site_jobs_parallel(site_name, search_terms, locations, results_wanted, hours_old, job_types, extra_params=None, proxies=None, max_workers=3):
    """Scrape one site for every (search term, location, job type) combination.

    Each combination is handed to ``scrape_single_job`` on a thread pool and
    the non-empty results are concatenated into one DataFrame.

    Args:
        site_name: Site identifier forwarded to the scraper (e.g. ``'linkedin'``).
        search_terms: Iterable of search strings.
        locations: Iterable of location strings.
        results_wanted: Forwarded to every search.
        hours_old: Forwarded to every search.
        job_types: Iterable of job-type values.
        extra_params: Optional dict of extra scraper keyword arguments, shared
            by every search.
        proxies: Optional list of proxy addresses, assigned to the searches
            round-robin.
        max_workers: Size of the thread pool (default 3, the value that was
            previously hard-coded).

    Returns:
        A DataFrame with all rows found (index reset), or an empty DataFrame
        when nothing was found. Rows appear in completion order, not in
        submission order.
    """
    combinations = list(product(search_terms, locations, job_types))

    num_proxies = len(proxies) if proxies else 0
    if num_proxies:
        print(f"Using {num_proxies} proxies")

    # One parameter dict per search, in submission order
    params_list = []
    for idx, (search_term, location, job_type) in enumerate(combinations):
        params = {
            'site_name': site_name,
            'search_term': search_term,
            'location': location,
            'job_type': job_type,
            'results_wanted': results_wanted,
            'hours_old': hours_old,
            'extra_params': extra_params
        }
        if num_proxies:
            params['proxy'] = proxies[idx % num_proxies]  # round-robin assignment
        params_list.append(params)

    if not params_list:
        # Nothing to search for: skip the pool and the empty progress bar
        return pd.DataFrame()

    results_list = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(scrape_single_job, params) for params in params_list]

        # NOTE: an earlier version tried to sleep 2 minutes after each result,
        # but its condition tested the stale `idx` left over from the loop
        # above and was therefore never true. The branch is removed rather
        # than revived: all searches are already submitted at this point, so
        # pausing the collector would not slow the workers down.
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"{site_name.capitalize()} - Progress"):
            result = future.result()
            if not result.empty:
                results_list.append(result)

    if not results_list:
        return pd.DataFrame()
    return pd.concat(results_list, ignore_index=True)

## Code launch

In [None]:
def _scrape_and_save(site_name, display_name, site_locations, extra_params=None):
    """Scrape one site with the global search settings and dump the rows to CSV.

    Args:
        site_name: Site identifier passed to the scraper; also used in the
            output file name ``./raw_data/jobs_<site_name>.csv``.
        display_name: Site name as it should appear in the progress messages.
        site_locations: Location list to search on this site.
        extra_params: Optional site-specific scraper options.

    Returns:
        The DataFrame returned by ``scrape_site_jobs_parallel``.
    """
    print(f"Scraping {display_name}")
    site_results = scrape_site_jobs_parallel(
        site_name=site_name,
        search_terms=search_terms,
        locations=site_locations,
        results_wanted=results_wanted,
        hours_old=hours_old,
        job_types=job_types,
        # proxies=proxies,
        extra_params=extra_params,
    )

    print(f"Found {len(site_results)} jobs on {display_name}")
    site_results.to_csv(f'./raw_data/jobs_{site_name}.csv', quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
    return site_results


# LinkedIn Scraping
linkedin_results = _scrape_and_save(
    'linkedin', 'LinkedIn', locations,
    extra_params={'linkedin_fetch_description': linkedin_fetch_description},
)

# Glassdoor Scraping (uses its own location spellings, no extra options)
glassdoor_results = _scrape_and_save('glassdoor', 'Glassdoor', glassdoor_locations)

# Indeed Scraping
indeed_results = _scrape_and_save(
    'indeed', 'Indeed', locations,
    extra_params={'country_indeed': 'Morocco'},
)

Scraping LinkedIn


Linkedin - Progress:   0%|          | 0/252 [00:00<?, ?it/s]2025-01-20 23:26:26,738 - INFO - JobSpy:LinkedIn - search page: 1 / 10
2025-01-20 23:26:27,066 - INFO - JobSpy:Linkedin - finished scraping
Linkedin - Progress:   0%|          | 1/252 [00:01<07:58,  1.91s/it]2025-01-20 23:26:27,435 - INFO - JobSpy:LinkedIn - search page: 1 / 10
2025-01-20 23:26:27,758 - INFO - JobSpy:Linkedin - finished scraping
Linkedin - Progress:   1%|          | 2/252 [00:02<04:57,  1.19s/it]2025-01-20 23:26:27,771 - INFO - JobSpy:LinkedIn - search page: 1 / 10
2025-01-20 23:26:28,642 - INFO - JobSpy:Linkedin - finished scraping
Linkedin - Progress:   1%|          | 3/252 [00:03<04:21,  1.05s/it]2025-01-20 23:26:29,094 - INFO - JobSpy:LinkedIn - search page: 1 / 10
2025-01-20 23:26:30,337 - INFO - JobSpy:LinkedIn - search page: 1 / 10
2025-01-20 23:26:30,636 - INFO - JobSpy:Linkedin - finished scraping
Linkedin - Progress:   2%|▏         | 4/252 [00:05<05:52,  1.42s/it]2025-01-20 23:26:30,788 - INFO - JobS

Found 350 jobs on LinkedIn
Scraping Glassdoor


Glassdoor - Progress:   0%|          | 0/210 [00:00<?, ?it/s]2025-01-20 23:32:46,095 - INFO - JobSpy:Glassdoor - search page: 1 / 4
2025-01-20 23:32:46,511 - INFO - JobSpy:Glassdoor - finished scraping
Glassdoor - Progress:   0%|          | 1/210 [00:01<06:53,  1.98s/it]2025-01-20 23:32:46,730 - INFO - JobSpy:Glassdoor - search page: 1 / 4
2025-01-20 23:32:47,057 - INFO - JobSpy:Glassdoor - finished scraping
Glassdoor - Progress:   1%|          | 2/210 [00:02<03:56,  1.14s/it]2025-01-20 23:32:47,827 - INFO - JobSpy:Glassdoor - search page: 1 / 4
2025-01-20 23:32:48,136 - INFO - JobSpy:Glassdoor - search page: 1 / 4
2025-01-20 23:32:48,170 - INFO - JobSpy:Glassdoor - finished scraping
Glassdoor - Progress:   1%|▏         | 3/210 [00:03<03:52,  1.13s/it]2025-01-20 23:32:48,570 - INFO - JobSpy:Glassdoor - finished scraping
Glassdoor - Progress:   2%|▏         | 4/210 [00:04<02:52,  1.19it/s]2025-01-20 23:32:50,243 - INFO - JobSpy:Glassdoor - search page: 1 / 4
2025-01-20 23:32:50,785 - IN

Found 46 jobs on Glassdoor
Scraping Indeed


Indeed - Progress:   0%|          | 0/252 [00:00<?, ?it/s]2025-01-20 23:35:45,028 - INFO - JobSpy:Indeed - search page: 1 / 1
2025-01-20 23:35:45,176 - INFO - JobSpy:Indeed - found no jobs on page: 1
2025-01-20 23:35:45,177 - INFO - JobSpy:Indeed - finished scraping
Indeed - Progress:   0%|          | 1/252 [00:01<06:44,  1.61s/it]2025-01-20 23:35:46,192 - INFO - JobSpy:Indeed - search page: 1 / 1
2025-01-20 23:35:46,304 - INFO - JobSpy:Indeed - search page: 1 / 1
2025-01-20 23:35:46,314 - INFO - JobSpy:Indeed - found no jobs on page: 1
2025-01-20 23:35:46,315 - INFO - JobSpy:Indeed - finished scraping
Indeed - Progress:   1%|          | 2/252 [00:02<05:33,  1.33s/it]2025-01-20 23:35:46,318 - INFO - JobSpy:Indeed - search page: 1 / 1
2025-01-20 23:35:46,417 - INFO - JobSpy:Indeed - found no jobs on page: 1
2025-01-20 23:35:46,418 - INFO - JobSpy:Indeed - finished scraping
Indeed - Progress:   1%|          | 3/252 [00:02<03:12,  1.30it/s]2025-01-20 23:35:46,434 - INFO - JobSpy:Indeed - 

Found 176 jobs on Indeed



  combined_results = pd.concat(results_list, ignore_index=True)


# Data Preprocessing

In [None]:
def _load_jobs_csv(input_file):
    """Read a scraped-jobs CSV; return None (after printing why) when there is nothing to process."""
    # A zero-byte file cannot be parsed at all
    if os.stat(input_file).st_size == 0:
        print(f"File {input_file} is empty. Skipping processing.")
        return None
    try:
        df = pd.read_csv(input_file)
    except pd.errors.EmptyDataError:
        # Raised when the file has bytes but no parsable header
        print(f"EmptyDataError: No columns to parse from file {input_file}. Skipping.")
        return None
    if df.empty:
        print(f"No data to parse from file {input_file}. Skipping.")
        return None
    return df


def process_fulltime_csv(input_file, output_file, title_keywords=None):
    """Keep the full-time offers whose title contains one of the keywords.

    Rows with ``jobType == 'Full-time'`` are sorted by ``date_posted``
    (newest first), filtered by a case-insensitive keyword match on ``title``
    and written to ``./processed_data/<output_file>_filtered_data.csv``.

    Args:
        input_file: Path of the raw CSV to read.
        output_file: Prefix for the output file name (e.g. the site name).
        title_keywords: Keywords to look for in the title. Each keyword is
            matched literally, so regex metacharacters (``C++``) are safe.
            Defaults to the module-level ``keywords`` list.

    Returns:
        None. Nothing is written when the input file is empty.
    """
    df = _load_jobs_csv(input_file)
    if df is None:
        return

    if title_keywords is None:
        # Backward-compatible fallback: the list defined at notebook level
        title_keywords = keywords

    full_time = df[df['jobType'] == 'Full-time']
    full_time = full_time.sort_values(by='date_posted', ascending=False)
    # Escape each keyword so it is treated as plain text, not as a regex
    pattern = '|'.join(re.escape(word) for word in title_keywords)
    mask = full_time['title'].str.contains(pattern, case=False, na=False)
    filtered_df = full_time[mask]
    print(output_file + " contains ->" + str(len(filtered_df)) + " offer")
    filtered_df.to_csv('./processed_data/' + str(output_file) + '_filtered_data.csv', index=False)


def process_internship_csv(input_file, output_file):
    """Extract the internship offers from a raw CSV.

    Rows with ``jobType == 'Internship'`` are sorted by ``date_posted``
    (newest first) and written to
    ``./processed_data/<output_file>_filtered_data_interns.csv``.

    Args:
        input_file: Path of the raw CSV to read.
        output_file: Prefix for the output file name (e.g. the site name).

    Returns:
        None. Nothing is written when the input file is empty.
    """
    df = _load_jobs_csv(input_file)
    if df is None:
        return

    internships = df[df['jobType'] == 'Internship']
    internships = internships.sort_values(by='date_posted', ascending=False)
    print(output_file + " contains ->" + str(len(internships)) + " offer")
    internships.to_csv('./processed_data/' + str(output_file) + '_filtered_data_interns.csv', index=False)

In [None]:
# Title keywords used by process_fulltime_csv to keep relevant full-time offers
keywords = ["Stagiaire", "Développeur", "Informatique", "pfe", "intern", "Developpeur"]

# Raw dumps are named jobs_<site>.csv; the captured part names the outputs
site_pattern = re.compile(r'jobs_(.*?)\.csv')

for csv_file in os.listdir('./raw_data'):
    found = site_pattern.search(csv_file)
    if found is None:
        continue  # not a scraper dump
    path = os.path.join('./raw_data', csv_file)
    site = found.group(1)
    process_fulltime_csv(path, site)
    process_internship_csv(path, site)

linkedin contains ->79 offer
linkedin contains ->18 offer
glassdoor contains ->10 offer
glassdoor contains ->14 offer
indeed contains ->35 offer
indeed contains ->88 offer


In [None]:
# XlsxWriter is the engine passed to pd.ExcelWriter in the export cell below.
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.0


In [None]:
# Collect every processed CSV into one Excel workbook, one sheet per file
csv_directory = './processed_data'

output_excel_file = 'interns_data.xlsx'

# os.listdir returns entries in arbitrary order; sort so the sheet order is
# the same on every run
csv_files = sorted(file for file in os.listdir(csv_directory) if file.endswith('.csv'))

with pd.ExcelWriter(output_excel_file, engine='xlsxwriter') as writer:
    for csv_file in csv_files:
        csv_path = os.path.join(csv_directory, csv_file)
        df = pd.read_csv(csv_path)
        # Excel rejects worksheet names longer than 31 characters; the longest
        # current name ("glassdoor_filtered_data_interns") is exactly 31, so
        # cap the name instead of failing on a longer site name
        sheet_name = os.path.splitext(csv_file)[0][:31]
        df.to_excel(writer, sheet_name=sheet_name, index=False)