## importing libraries

In [3]:
import os 
import numpy as np 
from bs4 import BeautifulSoup 
import requests
from selenium import webdriver 
import time
from selenium.webdriver.common.by import By 
import pandas as pd 
import random 
import json 
from collections import defaultdict
import ast

## functions

In [5]:
# manage json 
def write_json(data,fname,fpath=None):
    """Serialize ``data`` as JSON into ``<fpath>/<fname>.json``.

    Parameters
    ----------
    data : object
        Any value ``json.dump`` can serialize.
    fname : str
        File name without the ``.json`` extension.
    fpath : str, optional
        Target directory. The current working directory is used when this is
        omitted or empty.
    """
    directory = fpath if fpath else os.getcwd()
    # os.path.join avoids a doubled separator when ``fpath`` already ends in one.
    loc = os.path.join(directory, fname + '.json')

    # Write UTF-8 explicitly instead of relying on the platform default encoding.
    with open(loc, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file)


def read_json(fname,fpath=None):
    """Load and return the parsed content of ``<fpath>/<fname>.json``.

    Parameters
    ----------
    fname : str
        File name without the ``.json`` extension.
    fpath : str, optional
        Directory containing the file. The current working directory is used
        when this is omitted or empty.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file (dict, list, ...).
    """
    directory = fpath if fpath else os.getcwd()
    # os.path.join avoids a doubled separator when ``fpath`` already ends in one.
    loc = os.path.join(directory, fname + '.json')

    # Decode as UTF-8 explicitly; the platform default encoding can mis-read
    # non-ASCII characters in a hand-edited JSON file.
    with open(loc, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

In [6]:
# this function scrolls to the end of the page

def scroll():
    """Scroll the page of the module-level ``driver`` until the offset stops changing.

    Each pass sleeps one second, scrolls down by the current vertical offset
    and compares ``window.scrollY`` before and after the scroll. The loop ends
    as soon as a pass leaves the offset unchanged.
    """
    # Move off the very top first: scrolling by scrollY does nothing while scrollY is 0.
    driver.execute_script("window.scrollTo(0,100);")

    reached_end = False
    while not reached_end:
        offset_before = driver.execute_script("return window.scrollY")
        time.sleep(1)

        driver.execute_script("window.scrollBy(0,window.scrollY);")
        offset_after = driver.execute_script("return window.scrollY")

        reached_end = offset_after == offset_before


In [7]:
# this function parses the current page and returns the list of <div> tiles that match the given class

def collect_data(tile_class):
    """Return all ``<div>`` elements of the current page whose class matches ``tile_class``.

    The page source is taken from the module-level ``driver`` and parsed with
    BeautifulSoup's ``lxml`` parser; the result of ``find_all`` is returned
    unchanged (empty when nothing matches).
    """
    page = BeautifulSoup(driver.page_source, 'lxml')
    return page.find_all('div', attrs={"class": tile_class})


In [8]:
# random wait 

def wait():
    """Sleep for a random whole number of seconds between 1 and 5 (inclusive)."""
    time.sleep(random.randint(1, 5))


In [9]:
# finds the "next" button on the Naukri website and clicks it

def click_next(button_class):
    """Click the pagination "next" button if an enabled one is present.

    All elements carrying the CSS class ``button_class`` are looked up through
    the module-level ``driver``; the first one whose text contains "next"
    (case-insensitive) decides the outcome.

    Parameters
    ----------
    button_class : str
        Class name passed to ``driver.find_elements(By.CLASS_NAME, ...)``.

    Returns
    -------
    str or None
        ``'stop'`` when no element has that class, none of them is labelled
        "next", or the "next" button has a ``disabled`` attribute;
        ``None`` after the button was clicked.
    """
    for button in driver.find_elements(By.CLASS_NAME, button_class):
        if 'next' not in button.text.lower():
            continue

        if button.get_attribute("disabled"):
            return 'stop'

        driver.execute_script("arguments[0].click()", button)
        # Stop right after the first click: continuing the loop would click a
        # second "next" button (skipping a page) and would touch element
        # handles that may no longer be valid once the page has changed.
        return None

    # Either no element has this class or none of them is a "next" button.
    return 'stop'

In [10]:
# read a text file into a list of whitespace-stripped lines

def file_to_list(file_path):
    """Return the lines of the text file at ``file_path``, each with surrounding whitespace stripped."""
    with open(file_path, 'r') as handle:
        # Iterating the handle yields the same lines as readlines().
        return [raw_line.strip() for raw_line in handle]

In [11]:
# extract element data and convert it into a dictionary through which the data will be extracted via keys 

def extract_tag_data_to_dict_and_df(ele):
    """Collect the text of ``ele`` and every node reachable through ``next_element``.

    Starting at ``ele`` the chain of ``next_element`` links is followed until
    it yields ``None``. Each visited node contributes its ``text`` under the
    key ``(str(class), str(tag name), 'text')``. A node without a ``get``
    method, or whose ``get('class', ...)`` falls back to the default, reuses
    the class value of the most recent node that supplied one (initially
    ``'0'``); a node without a ``name`` attribute uses ``'0'`` as tag name.

    Returns
    -------
    tuple
        ``(ele_data_str, ele_data_df)``: a dict mapping each key to its
        distinct, non-empty texts joined by ``'--'`` (first-seen order), and a
        DataFrame with one row per key and a single ``'text'`` column.
    """
    texts_by_key = defaultdict(list)

    last_class = '0'
    node = ele
    while node is not None:
        # Nodes without .get keep the class carried over from earlier nodes.
        if hasattr(node, 'get'):
            last_class = node.get('class', last_class)
        tag_name = node.name if hasattr(node, 'name') else '0'

        node_text = node.text
        if node_text is None:
            node_text = ''

        texts_by_key[(str(last_class), str(tag_name), 'text')].append(node_text)
        node = node.next_element

    ele_data_str = {}
    for key, texts in texts_by_key.items():
        # dict.fromkeys drops repeats while keeping first-seen order.
        distinct_texts = list(dict.fromkeys(texts))
        ele_data_str[key] = "--".join(str(item) for item in distinct_texts if item)

    ele_data_df = pd.DataFrame.from_dict(ele_data_str, orient='index', columns=['text'])

    return ele_data_str, ele_data_df

## initialise parameters

In [12]:
# current directory, with a trailing slash so file names can be appended directly
cwd = f"{os.getcwd()}/"

# search URLs to scrape, loaded from urls.json in the working directory
urls = read_json('urls')


## open chrome and get jobs tiles from the url 

### get the data - job_1

In [10]:
# Scrape every search URL in `urls`, page by page, and store the raw HTML of
# each job tile in job_1.json under the key "<url_id>--tile_<n>".
driver = webdriver.Chrome()
tiles_db = {}

try:
    driver.get('https://www.naukri.com/')
    # Pause so the main account credentials can be entered by hand.
    time.sleep(60)

    for url_id, url in urls.items():
        driver.get(url)
        wait()
        state = 'start'
        tiles = []
        # click_next returns 'stop' once there is no further page to open.
        while state != 'stop':
            scroll()
            wait()
            tiles.extend(collect_data(tile_class='srp-jobtuple-wrapper'))
            wait()
            state = click_next('styles_btn-secondary__2AsIP')

        for t_count, tile in enumerate(tiles):
            # Tiles are stored as strings because bs4 tags are not JSON-serializable.
            tiles_db[f'{url_id}--tile_{t_count}'] = str(tile)
finally:
    # quit() ends the whole WebDriver session, also when scraping fails midway;
    # close() would only close the current window.
    driver.quit()

write_json(tiles_db,'job_1')

### create df - job_1

In [19]:
# Load the stored tile HTML and parse each string back into its first <div> tag.
job_1 = {
    tile_id: BeautifulSoup(tile_html, 'lxml').find('div')
    for tile_id, tile_html in read_json('job_1').items()
}


In [20]:
# Extract one record per tile and build the frame in a single step; growing a
# DataFrame with df.loc[len(df)] inside the loop gets slow as rows accumulate.
job_1_records = []

for tile_id, tile in job_1.items():
    # The title link supplies both the job title and the job URL.
    title_link = tile.find('a', class_='title')

    job_1_records.append({
        'job_id': str(tile.get('data-job-id')),
        'company_name': tile.find('a', class_='comp-name').text.strip(),
        'job_title': title_link.text.strip(),
        'job_url': title_link['href'],
    })

df_job_1 = pd.DataFrame(job_1_records, columns=['job_id','company_name','job_title','job_url'])
  

In [21]:
# Add a 'tile_duplicates' column holding how many times each identical row
# occurs, then keep a single copy of every row.
row_columns = df_job_1.columns.tolist()
duplicate_counts = df_job_1.groupby(row_columns).transform('size')
df_job_1 = df_job_1.assign(tile_duplicates=duplicate_counts).drop_duplicates()

In [22]:
# Save the de-duplicated job list as job_1.csv in the working directory.
df_job_1.to_csv(f'{cwd}job_1.csv', index=False)


## open chrome and get additional information for each jobs

### get the data - job_2

In [24]:
# Read job_id as text from the start. Letting read_csv infer a numeric type
# and converting afterwards drops leading zeros and yields values such as
# '123.0' whenever the column contains a missing entry.
df_job_1 = pd.read_csv(cwd+'job_1.csv', dtype={'job_id': str})
# Missing ids become the text 'nan', as before, so the column holds only strings.
df_job_1['job_id'] = df_job_1['job_id'].astype(str)

In [25]:
# Open every job URL from df_job_1 and store the HTML of the job-detail
# container in job_2.json, keyed by job id.
driver = webdriver.Chrome()
tiles = {}

try:
    driver.get('https://www.naukri.com/')
    # Pause so the dummy account credentials can be entered by hand.
    time.sleep(60)

    for _, job_row in df_job_1.iterrows():
        job_id = str(job_row['job_id'])

        driver.get(job_row['job_url'])
        wait()
        scroll()
        wait()
        tile = collect_data(tile_class='styles_left-section-container__btAcB')
        if tile:
            # Only the first matching container is kept; pages without one are skipped.
            tiles[job_id] = str(tile[0])
        wait()
finally:
    # quit() ends the whole WebDriver session, also when scraping fails midway;
    # close() would only close the current window.
    driver.quit()

# writing the tiles result in a file
with open(cwd+'job_2.json', 'w') as json_file:
    json.dump(tiles, json_file)

### create df - job_2

In [26]:
# Load job_2.json and turn each stored HTML string (one per job id) back into
# its first <div> tag.
with open(cwd + 'job_2.json', 'r') as json_file:
    job_2 = {
        job_id: BeautifulSoup(tile_html, 'lxml').find('div')
        for job_id, tile_html in json.load(json_file).items()
    }


In [27]:
# # creating a dataframe to find_out columns keys
# tile = job_2['150125006493']
# dict_tile, df_tile = extract_tag_data_to_dict_and_df(tile)
# df_tile.to_csv(cwd+'df_tile.csv')

In [28]:
# Maps each output column name to the key under which its text is looked up in
# the dictionary returned by extract_tag_data_to_dict_and_df.
# Every key is a tuple: (str() of the element's class list, tag name, 'text').

column_find_via = {
    # Disabled entries (kept for reference):
    # 'job_title': ("['styles_jhc__jd-top-head__MFoZl']", 'header', 'text'),
    # 'company_name':	("['styles_jd-header-comp-name__MvqAI']", 'a', 'text'),
    'company_rating' : 	("['styles_amb-rating__4UyFL']", 'span', 'text'),
    'job_experience' : 	("['styles_jhc__exp__k_giM']", 'div', 'text'),
    'job_location':	("['styles_jhc__loc___Du2H']", 'div', 'text'),
    'salary': ("['ni-icon-salary']", 'span', 'text'),
    'keywords':	("['styles_chip__7YCfG', 'styles_clickable__dUW8S']", 'span', 'text'),
    'apply_on_company_site': ("['styles_company-site-button__C_2YK', 'company-site-button']", 'button', 'text'),
    'easy_apply':("['styles_apply-button__uJI3A', 'apply-button']", 'button', 'text')
    
}

In [29]:
# Build the job_2 detail table: one record per job page, one column per entry
# in column_find_via. Records are collected in a list and converted once,
# because growing a DataFrame with df.loc[len(df)] gets slow as rows accumulate.
job_2_columns = ['job_id'] + list(column_find_via.keys())
job_2_records = []

for job_id, tile in job_2.items():
    # Only the dictionary is needed here; the DataFrame view is ignored.
    dict_tile, _ = extract_tag_data_to_dict_and_df(tile)

    record = {'job_id': str(job_id)}
    for col, lookup_key in column_find_via.items():
        # .get leaves the field as None when the page has no such element.
        record[col] = dict_tile.get(lookup_key)
    job_2_records.append(record)

df_job_2_details = pd.DataFrame(job_2_records, columns=job_2_columns)

# Left merge keeps every job from df_job_1, even if its detail page produced no record.
df_job_2 = df_job_1.merge(df_job_2_details, on='job_id', how='left')

df_job_2.to_csv(cwd+'job_2.csv',index=False)

In [13]:
# filtering jobs: load the merged job table written as job_2.csv
df_job_3 = pd.read_csv(f'{cwd}job_2.csv')

## write filtering criteria here

# store the (currently unfiltered) table as job_3.csv
df_job_3.to_csv(f'{cwd}job_3.csv', index=False)

In [14]:
# reload the job table from job_3.csv
df_job_3 = pd.read_csv(f'{cwd}job_3.csv')


In [24]:
keyword_terms = [
    'analyst',
    'analytics',
    'azure',
    'data',
    'visualization',
    'databricks',
    'pyspark',
]

def calculate_score(keywords):
    """Score a keyword string by how often the terms in ``keyword_terms`` occur in it.

    Matching is case-insensitive and counts substring occurrences, so one term
    contained in another (e.g. 'data' inside 'databricks') is counted for both.

    Returns
    -------
    int or float
        The summed occurrence count, or ``1.5`` when ``keywords`` is missing
        according to ``pd.isna``.
    """
    if pd.isna(keywords):
        return 1.5
    lowered = keywords.lower()
    return sum(lowered.count(term.lower()) for term in keyword_terms)

In [29]:
# Score every job by its keywords, order the table (best score first, then
# rating, easy_apply, company name) and save it as job_3.csv.
df_job_3 = (
    df_job_3
    .assign(keyword_score=df_job_3['keywords'].apply(calculate_score))
    .sort_values(
        by=['keyword_score', 'company_rating','easy_apply','company_name'],
        ascending=[False, False,False,True],
    )
)
df_job_3.to_csv(f'{cwd}job_3.csv', index=False)

In [46]:
#function to create html

def generate_html_with_tiles(url_dict, filename=f'{cwd}naukri.html'):
    """Write a standalone HTML page with one clickable tile per entry of ``url_dict``.

    Clicking a tile opens its URL in a new tab and toggles a "clicked" marker
    that the page keeps in the browser's ``localStorage`` (key
    ``clickedTiles``, a JSON list of URLs), so visited tiles stay highlighted
    when the page is reloaded.

    Parameters
    ----------
    url_dict : dict
        Maps the text shown on a tile to the URL the tile opens.
    filename : str, optional
        Path of the HTML file to write. The default is built from ``cwd`` at
        the time this function is defined.
    """
    from html import escape  # local import keeps this cell self-contained

    # Begin writing the HTML content
    html_content = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>URL Tiles</title>
    <style>
        .tile {
            display: inline-block;
            margin: 10px;
            padding: 20px;
            border: 1px solid #ccc;
            border-radius: 8px;
            background-color: #f9f9f9;
            cursor: pointer;
            text-align: center;
            min-width: 100px;
        }
        .clicked {
            background-color: lightgreen;
        }
    </style>
</head>
<body>
    <div id="tiles-container">
'''

    # Add tiles for each URL. Labels and URLs come from scraped pages, so both
    # are HTML-escaped. The URL is carried in a data attribute instead of being
    # spliced into a JavaScript string, where a quote character would break the
    # handler.
    tile_parts = []
    for display_text, url in url_dict.items():
        safe_url = escape(str(url), quote=True)
        safe_text = escape(str(display_text))
        tile_parts.append(f'''
        <div class="tile" data-url="{safe_url}" onclick="handleClick(this)">
            {safe_text}
        </div>
''')
    html_content += ''.join(tile_parts)

    # Add script to handle clicks and storage
    html_content += '''
    </div>
    <script>
        function handleClick(tile) {
            const url = tile.dataset.url;
            let clickedTiles = JSON.parse(localStorage.getItem('clickedTiles')) || [];
            if (clickedTiles.includes(url)) {
                // If already clicked, remove from clickedTiles and remove clicked class
                clickedTiles = clickedTiles.filter(item => item !== url);
                tile.classList.remove('clicked');
            } else {
                // If not clicked, add to clickedTiles and add clicked class
                clickedTiles.push(url);
                tile.classList.add('clicked');
            }
            localStorage.setItem('clickedTiles', JSON.stringify(clickedTiles));
            window.open(url, '_blank');  // Open the URL in a new tab
        }

        // Mark previously clicked tiles
        window.onload = function() {
            let clickedTiles = JSON.parse(localStorage.getItem('clickedTiles')) || [];
            document.querySelectorAll('.tile').forEach(tile => {
                if (clickedTiles.includes(tile.dataset.url)) {
                    tile.classList.add('clicked');
                }
            });
        }
    </script>
</body>
</html>'''

    # The page declares charset UTF-8, so the file is written in that encoding
    # rather than the platform default.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(html_content)

In [56]:
# Build the tile label "title | company | rating | location".
# Missing ratings are mapped to 'NA' before joining: a plain astype(str) would
# turn them into the literal text 'nan', which na_rep can no longer replace.
rating_text = df_job_3['company_rating'].map(
    lambda rating: 'NA' if pd.isna(rating) else str(rating)
)

df_job_3['display_text'] = df_job_3['job_title'].str.cat(
    [df_job_3['company_name'], rating_text, df_job_3['job_location']],
    sep=' | ',
    na_rep='NA',
)

# label -> job URL mapping, filled in the next cell
job_3 = {}

In [61]:
# Map each tile label to its job URL; when a label repeats, the last URL wins.
job_3.update(zip(df_job_3['display_text'], df_job_3['job_url']))

In [62]:
# Render the label -> URL mapping as clickable tiles, written to the function's default output file.
generate_html_with_tiles(job_3)