### import libraries

In [139]:
import os 
import numpy as np 
from bs4 import BeautifulSoup 
import requests
from selenium import webdriver 
import time
from selenium.webdriver.common.by import By 
import pandas as pd 
import random 
import json 
from collections import defaultdict
import ast

### functions

In [140]:
# manage json 
def write_json(data, fname, fpath=None):
    """Serialize ``data`` to ``<fname>.json``.

    Args:
        data: Object to serialize; passed straight to ``json.dump``.
        fname: File name without the ``.json`` extension (it is appended).
        fpath: Directory to write into. When falsy (``None`` or ``''``) the
            current working directory is used.

    Raises:
        OSError: If the target file cannot be opened for writing.
        TypeError: If ``data`` is not JSON-serializable.
    """
    directory = fpath if fpath else os.getcwd()
    # os.path.join avoids a doubled separator when ``fpath`` already ends in one.
    loc = os.path.join(directory, fname + '.json')

    # Explicit encoding keeps the file independent of the platform locale.
    with open(loc, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file)


def read_json(fname, fpath=None):
    """Load and return the contents of ``<fname>.json``.

    Args:
        fname: File name without the ``.json`` extension (it is appended).
        fpath: Directory to read from. When falsy (``None`` or ``''``) the
            current working directory is used.

    Returns:
        The Python object produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    directory = fpath if fpath else os.getcwd()
    # os.path.join avoids a doubled separator when ``fpath`` already ends in one.
    loc = os.path.join(directory, fname + '.json')

    # Decode as UTF-8 explicitly so non-ASCII text is not read with whatever
    # the platform's default encoding happens to be.
    with open(loc, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

In [141]:
def generate_html_with_tiles(url_dict, filename=None):
    """Write a standalone HTML page with one clickable tile per URL.

    Clicking a tile toggles its "clicked" highlight, stores the set of
    clicked URLs in the browser's ``localStorage`` under ``clickedTiles`` and
    opens the URL in a new tab. On page load, tiles whose URL is in that
    stored list are highlighted again.

    Args:
        url_dict: Mapping of tile label -> URL. Tiles are emitted in the
            mapping's iteration order.
        filename: Path of the HTML file to write. Defaults to ``tiles.html``
            in the current working directory; the default is resolved when
            the function is called, so it does not depend on any notebook
            variable existing at definition time.

    Raises:
        OSError: If ``filename`` cannot be opened for writing.
    """
    import html  # local import: only this function needs it

    if filename is None:
        filename = os.path.join(os.getcwd(), 'tiles.html')

    # Static head of the page (markup + styling).
    page_head = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>URL Tiles</title>
    <style>
        .tile {
            display: inline-block;
            margin: 10px;
            padding: 20px;
            border: 1px solid #ccc;
            border-radius: 8px;
            background-color: #f9f9f9;
            cursor: pointer;
            text-align: center;
            min-width: 100px;
        }
        .clicked {
            background-color: lightgreen;
        }
    </style>
</head>
<body>
    <div id="tiles-container">
'''

    # One tile per URL. The URL travels in a data attribute (HTML-escaped)
    # instead of being spliced into a JavaScript string literal, so quotes,
    # ampersands or angle brackets in a URL or label cannot break the page.
    tiles = ''.join(
        f'''
        <div class="tile" data-url="{html.escape(str(url), quote=True)}" onclick="handleClick(this)">
            {html.escape(str(display_text))}
        </div>
'''
        for display_text, url in url_dict.items()
    )

    # Script: click handling plus restoring the highlight from localStorage.
    page_tail = '''
    </div>
    <script>
        function handleClick(tile) {
            const url = tile.dataset.url;
            let clickedTiles = JSON.parse(localStorage.getItem('clickedTiles')) || [];
            if (clickedTiles.includes(url)) {
                // If already clicked, remove from clickedTiles and remove clicked class
                clickedTiles = clickedTiles.filter(item => item !== url);
                tile.classList.remove('clicked');
            } else {
                // If not clicked, add to clickedTiles and add clicked class
                clickedTiles.push(url);
                tile.classList.add('clicked');
            }
            localStorage.setItem('clickedTiles', JSON.stringify(clickedTiles));
            window.open(url, '_blank');  // Open the URL in a new tab
        }

        // Mark previously clicked tiles
        window.onload = function() {
            let clickedTiles = JSON.parse(localStorage.getItem('clickedTiles')) || [];
            document.querySelectorAll('.tile').forEach(tile => {
                if (clickedTiles.includes(tile.dataset.url)) {
                    tile.classList.add('clicked');
                }
            });
        }
    </script>
</body>
</html>'''

    # The page declares UTF-8 in its <meta> tag, so write it as UTF-8.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(page_head + tiles + page_tail)

### basic variables setup

In [142]:
# Working directory of the kernel, with a trailing "/" so that file names
# can be appended to it directly.
cwd = f"{os.getcwd()}/"


### glassdoor urls

#### glassdoor : url parameters 

In [143]:
# Accumulator filled by the URL-building cell below: tile label -> Glassdoor URL.
glassdoor_urls = {}

# Every dictionary below maps a human-readable label (used in the tile text)
# to the URL fragment it contributes. Query fragments are stored WITHOUT a
# leading '&': the URL-building loop inserts the separators itself.
#
# NOTE: `start` and `sort` are reassigned by the LinkedIn parameter cell
# further down, so re-run this cell before rebuilding the Glassdoor URLs.

# Base of every Glassdoor job-search URL.
start = 'https://www.glassdoor.co.in/Job/'

# Search term: label -> path segment appended to `start`.
search_term = {
    'Analytics + India': 'india-analytics-jobs-SRCH_IL.0,5_IN115_KO6,15.htm',
    'Analyst + India': 'india-analyst-jobs-SRCH_IL.0,5_IN115_KO6,13.htm',
    'Pyspark + India': 'india-pyspark-jobs-SRCH_IL.0,5_IN115_KO6,13.htm',
}

# Rating filter.
rating = {
    'min rating = 3.5': 'minRating=3.5',
}

# Sort order.
sort = {
    'order by date': 'sortBy=date_desc',
}

# Company size filter.
company_size = {
    '>5000 employees': 'employerSizes=5',
}

# Seniority filter. Other fragments that were tried and left disabled:
#   'Mid Senior Level': 'seniorityType=midseniorlevel'
#   'Director':         'seniorityType=director'
#   'Executive':        'seniorityType=executive'
seniority = {
    'All Seniority levels': 'seniorityType=all',
}

# Location filter.
city = {
    # Previously '&remoteWorkType=1', which produced '&&remoteWorkType=1'
    # once the loop added its own '&' separator.
    'Remote': 'remoteWorkType=1',
    'Mumbai': 'cityId=2851180',
    'Gurgaon': 'cityId=2921225',
    'Noida': 'cityId=4477468',
}

In [144]:
# Build one Glassdoor URL per combination of search term, rating, company
# size, seniority and city, keyed by a readable label for the tile.

# The sort fragment is the same for every URL, so look it up once. Keeping the
# lookup out of the f-string also avoids reusing the f-string's own quote
# character inside a replacement field, which is a SyntaxError before
# Python 3.12.
sort_value = sort['order by date']

for search_term_key, search_term_value in search_term.items():
    for rating_key, rating_value in rating.items():
        for company_size_key, company_size_value in company_size.items():
            for seniority_key, seniority_value in seniority.items():
                for city_key, city_value in city.items():
                    # Strip stray '&' from each fragment before joining so a
                    # fragment stored with its own separator cannot yield '&&'.
                    query = '&'.join(
                        part.strip('&')
                        for part in (
                            rating_value,
                            sort_value,
                            company_size_value,
                            seniority_value,
                            city_value,
                        )
                    )
                    url_value = f'{start}{search_term_value}?{query}'
                    url_key = f'Glassdoor : {search_term_key} + {rating_key} + {company_size_key} + {seniority_key} + {city_key}'
                    glassdoor_urls[url_key] = url_value
                                
                    

### linkedin url

#### linkedin url parameters

In [145]:
# Accumulator filled by the URL-building cell below: tile label -> LinkedIn URL.
linkedin_urls = {}

# Base of every LinkedIn job-search URL.
# NOTE: this reassigns `start` (and `sort` below), which the Glassdoor cells
# also use; re-run the Glassdoor parameter cell before rebuilding those URLs.
start = 'https://www.linkedin.com/jobs/search/'

# Keyword: label -> already URL-encoded `keywords=` fragment.
keywords = {
    'Analytics and Manager': 'keywords=%22Analytics%22%20AND%20%22Manager%22',
    'Business Analyst': 'keywords=%22Business%20Analyst%22',
}

# Sort order.
sort = {
    'order by date': 'sortBy=DD',
}

# Geography: label -> numeric id used as the value of `geoId=` / `f_PP=`.
geography = {
    'India': '102713980',
    'Worldwide': '92000000',
    'Bengaluru': '105214831',
    'Gurgaon': '104793846',
    'Delhi': '105282602',
    'Mumbai': '106164952',
    'Noida': '104869687',
    'Gurugram': '106442238',
}

# Geography combinations: label -> query fragment, stored WITHOUT a leading
# '&' (the URL-building loop adds the separators). The multi-city entry uses
# the `f_PP` parameter with the ids joined by an encoded comma (%2C).
# Fragments are derived from `geography` so each id is written only once.
metro_cities = ('Bengaluru', 'Mumbai', 'Gurugram', 'Gurgaon', 'Noida', 'Delhi')

geography_combination = {
    # Previously '&geoId=...', which produced '&&geoId=...' in the final URL.
    'India': f"geoId={geography['India']}",
    'Delhi + Bengaluru + Gurgaon + Noida + Mumbai': 'f_PP=' + '%2C'.join(
        geography[name] for name in metro_cities
    ),
    'Remote - India': f"f_WT=2&geoId={geography['India']}",
}

In [146]:
# Build one LinkedIn URL per (keyword, geography combination) pair, keyed by a
# readable label for the tile.

# The sort fragment is the same for every URL, so look it up once. Keeping the
# lookup out of the f-string also avoids reusing the f-string's own quote
# character inside a replacement field, which is a SyntaxError before
# Python 3.12.
linkedin_sort = sort['order by date']

for keywords_key, keywords_value in keywords.items():
    for geography_combination_key, geography_combination_value in geography_combination.items():
        url_key = f'linkedin : {keywords_key} + {geography_combination_key}'
        # Strip stray '&' from each fragment before joining so a fragment
        # stored with its own separator cannot yield '&&' in the URL.
        query = '&'.join(
            part.strip('&')
            for part in (linkedin_sort, geography_combination_value, keywords_value)
        )
        url_value = f'{start}?{query}'
        linkedin_urls[url_key] = url_value

### creating html file

In [147]:
# Combine both sources into one label -> URL mapping: LinkedIn entries come
# first, then Glassdoor; if a label occurs in both, the Glassdoor URL is kept.
url_dict = dict(linkedin_urls)
url_dict.update(glassdoor_urls)

# Render the combined mapping as the tiles page (default output file).
generate_html_with_tiles(url_dict)