## Importing Packages

In [2]:
import pandas as pd
import requests
import json
import numpy as np
import re

## Function Definition

In [2]:
def req_handler(url, header_input, request_type, payload_input=None):
    """
    Send a GET or POST request and wrap the outcome in a status dictionary.

    :param url: (str) address of the endpoint to call.
    :param header_input: headers passed through to ``requests``.
    :param request_type: (str) 'POST' or 'GET'.
    :param payload_input: JSON-serialisable body for POST requests; unused for
        GET. A missing or empty payload is rejected for POST.
    :return: dictionary that always has ``status`` ('success', 'failure' or
        'Error') and ``body``. A successful GET also carries ``headers``; a
        non-200 response carries ``status_code``; a transport or JSON decoding
        problem carries ``error`` with the exception text and an empty ``body``.
    """
    # Validate the arguments before touching the network.
    if type(url) is not str:
        return {'status': 'failure', 'body': 'Url was not string type.'}
    if request_type not in ('GET', 'POST'):
        return {'status': 'failure', 'body': 'Cant handle request types except GET and POST.'}

    try:
        # NOTE(review): verify=False switches off TLS certificate verification
        # (it avoids requests.exceptions.SSLError on hosts whose certificate
        # cannot be validated). That leaves the connection open to
        # interception; prefer a trusted CA bundle where possible.
        if request_type == 'GET':
            print("Its a GET request")
            response = requests.get(url, headers=header_input, verify=False)
        else:
            print("Its a POST request")
            if not payload_input:
                return {"status": "failure", "body": "No payload detected."}
            response = requests.post(url, headers=header_input, json=payload_input, verify=False)

        # Report the HTTP status instead of calling raise_for_status(), which
        # either raises or returns None and therefore never made a useful body.
        if response.status_code != 200:
            return {'status': 'failure', 'body': 'Unable to connect to API.',
                    'status_code': response.status_code}

        result = {'status': 'success', 'body': response.json()}
        if request_type == 'GET':
            result['headers'] = response.headers
        return result
    except (requests.exceptions.RequestException, ValueError) as exc:
        # Connection problems and undecodable JSON keep the original 'Error'
        # shape; unrelated programming errors are no longer hidden.
        return {'status': 'Error', 'body': {}, 'error': str(exc)}

In [3]:
def info_extractor(info, num):
    """
    Pick the skill-related fields out of one response and label it.

    :param info: mapping whose 'body' entry supports ``.get``; the keys read
        from it are 'Additional', 'Skills' and 'pskills' (each defaults to an
        empty list when absent).
    :param num: value appended to "Demand-" to build the identifier.
    :return: dictionary with 'Additional_Skills', 'Skills' (a one-element list
        wrapping the list of skill dictionaries), 'Pskills' and 'Demand'.
    """
    body = info['body']
    extra = body.get('Additional', [])

    # Each entry of 'Skills' is read positionally: name, confidence, flag.
    skill_records = []
    for entry in body.get('Skills', []):
        skill_records.append({
            "Skill_name": entry[0],
            'confidence': entry[1],
            'mandatory_flag': entry[2],
        })

    primary = body.get('pskills', [])
    return {
        'Additional_Skills': extra,
        'Skills': [skill_records],
        'Pskills': primary,
        'Demand': "Demand-" + str(num),
    }

In [4]:
def url_cleaner(url):
    """
    Reduce a URL to the first dot-separated label that follows its scheme.

    :param url: value expected to be a string starting with "http".
    :return: the text before the first "." once "http://" / "https://" and an
        optional "www." are removed and surrounding whitespace and slashes are
        stripped; ``np.nan`` when ``url`` is not a ``str`` or does not start
        with "http".
    """
    # Anything that is not exactly a str cannot be cleaned.
    if type(url) is not str:
        return np.nan
    # Strings that do not look like an http(s) link are rejected as well.
    if not url.startswith("http"):
        return np.nan

    without_scheme = re.sub(r"https?://(www\.)?", "", url)
    trimmed = without_scheme.strip().strip("/")
    first_label, _, _ = trimmed.partition(".")
    return first_label

## Variable Declaration

In [8]:
# Read clean.csv into a DataFrame, decoding the file as mac_roman.
# NOTE(review): absolute, machine-specific path; consider a configurable data
# directory so the notebook runs elsewhere.
data_df = pd.read_csv("C://Users//KIIT//Desktop//clean.csv", encoding='mac_roman')
# Values of the '_source.jobDescription' column as a plain list.
job_desc = list(data_df['_source.jobDescription'])
# Headers sent with every API request.
Header = {'Content-type': 'application/json'}
# API endpoints. Url is the one used by the active POST calls in this notebook;
# Url1 and Url2 are referenced only by the commented-out calls further down.
Url = "https://edge--non-prod--cda.hirealchemy.com/process"
Url1 = "https://edge--non-prod--cda.hirealchemy.com/v2/fetch_function"
Url2 = "https://edge--non-prod--cda.hirealchemy.com/additional_analysis"

## Function calls

In [7]:
# Company URLs as a plain list, then each one reduced by url_cleaner
# (np.nan wherever the value is not an http(s) string).
title = list(data_df.loc[:, '_source.companyUrl'])
title_list = list(map(url_cleaner, title))

In [1]:
# Rows of the two text columns as [jobDescription, title] lists.
payload = data_df[['_source.jobDescription', '_source.title']].to_dict(orient='split')['data']
# One POST per row to Url; responses are keyed by the row's position in payload.
ans = dict(enumerate(
    req_handler(Url, Header, 'POST', {"title": row[1], 'jd_text': row[0]})
    for row in payload
))

<h2>Brief</h2>
<p>The payload holds every row of the columns _source.jobDescription and _source.title; each row is converted into a dictionary with two keys (<code>jd_text</code> and <code>title</code>) before it is sent.</p>
<p>Actually, the payload was divided into 5 subsets to make the API hits easier and errors easier to find.</p>
<p>ans is a dictionary that holds the status, the response, and sometimes the payload that was sent.</p>

In [2]:
#holder = [req_handler(Url1, Header, 'POST', {"sentences": t}) for t in job_desc]
#extract = [holder[a]['body'] for a in range(0, len(holder))]
# extract is a list of dictionary pairs for each api hit.

#key_changed = [[{"function_name": inner['class'], "confidence": inner['prob']} for inner in outer]for outer in extract]

#holder_2 = [req_handler(Url2, Header, 'POST', {"text": put}) for put in job_desc]
#extracted_domain = [holder_2[x]['body']['extracted_domains'] for x in range(0, len(holder_2))]

<h2>Brief</h2>
<p>The <b>holder</b> variable is a list of results, just as <b>ans</b> was a dictionary of statuses, responses, etc.</p>
<p><b>extract</b> collects the value stored under the <b>body</b> key of each dictionary in that list.</p>
<p><b>holder_2</b> is a list too, built the same way, but its payload differs from the one used for the holder list.</p>
<p><b>extracted_domain</b> takes the value of the key <i>extracted_domains</i> from each nested dictionary.</p>

## Appending Dataframe

In [3]:
# NOTE(review): this binds a second name to the same DataFrame object, so every
# column added below also appears on data_df. Use data_df.copy() if the source
# frame should stay untouched; confirm intent.
Data_df = data_df
# NOTE(review): extracted_domain, key_changed and holder are assigned only in
# the commented-out cell above, so this cell raises NameError on a fresh kernel
# unless those lines are restored and run first.
Data_df['Extracted_Domains'] = extracted_domain
Data_df['Function_and_Confidence'] = key_changed
# NOTE(review): the keys read from holder below ('Additional_Skills', 'Skills',
# 'Pskills', 'Demand') are the keys info_extractor returns, not the
# 'status'/'body' keys of a req_handler response; presumably holder was rebuilt
# from info_extractor output before this cell ran. Confirm.
# Additional skills of each holder entry.
temp1 = [holder[x]['Additional_Skills'] for x in range(0, len(holder))]
Data_df['Additional_skills'] = temp1
# Skills of each holder entry.
temp2 = [holder[x]['Skills'] for x in range(0, len(holder))]
Data_df['skills'] = temp2
# Pskills of each holder entry.
temp3 = [holder[x]['Pskills'] for x in range(0, len(holder))]
Data_df['pskills'] = temp3
# Demand identifier of each holder entry.
temp4 = [holder[x]['Demand'] for x in range(0, len(holder))]
Data_df['Demand_Number'] = temp4
# Company names produced by url_cleaner from the company URL column.
Data_df['Normalised_company_names'] = title_list

#Data_df.head()

### Getting the Excel sheet prepared from the Dataframe

In [1]:
#Data_df.to_excel (r'C:\Users\KIIT\Desktop\Changed.xlsx', index = False, header=True)

### New Dataframe

In [1]:
# NOTE(review): pandas is already imported in the first cell; this repeat
# presumably lets the cell run on its own in a fresh kernel. Confirm, and keep
# imports in the top cell if not.
import pandas as pd
# Load the workbook at the same path the commented-out to_excel call above
# writes to. NOTE(review): absolute, machine-specific path.
Check = pd.read_excel(r"C:\Users\KIIT\Desktop\Changed.xlsx")
# Show the first rows of the loaded frame.
Check.head()

Unnamed: 0,_index,_type,_id,_score,_source.activation,_source.companyDescription,_source.companyName,_source.companyUrl,_source.employmentType,_source.experience.value.maxValue,...,_source.title,_source.category,_source.function,_source.industry,Additional_skills,skills,pskills,Demand_Number,Normalised_company_names,Extracted_domains
0,schemanormalized,schemanormalized,naukri_190120001092,0.86549,True,-Redington Gulf FZE is a wholly owned subsidia...,Redington Gulf FZE,,"Full Time,Permanent",3,...,Accounts & Finance Executive (b.com / M.com) -...,,,,"['account payable', 'accounting entries', 'acc...","[[{'Skill_name': 'sales', 'confidence': 98, 'm...",[],Demand-0,,[]
1,schemanormalized,schemanormalized,naukri_070120908798,0.855256,True,,"Public Service Commission, West Bengal",https://www.pscwbonline.gov.in/,"Full Time,Permanent",1,...,Laboratory Assistant - Medical Laboratory Tech...,,,,"['basic computer', 'business administration', ...","[[{'Skill_name': 'assistant', 'confidence': 98...",[],Demand-1,pscwbonline,"['bpo', 'sales']"
2,schemanormalized,schemanormalized,naukri_070120908792,0.855256,True,,"Public Service Commission, West Bengal",https://www.pscwbonline.gov.in/,"Full Time,Permanent",1,...,Laboratory Assistant - Agricultural Engineerin...,,,,"['building services engineering', 'business ad...","[[{'Skill_name': 'assistant', 'confidence': 98...",[],Demand-2,pscwbonline,['hr']
3,schemanormalized,schemanormalized,careesma_05ed13a05d4a31fe6162bbd8ac636693,0.853763,True,,PGIMER,https://www.careesma.in/,,-1,...,Scientist,,,,"['anaglyph image', 'apache config', 'cyberknif...","[[{'Skill_name': '3d', 'confidence': 94, 'mand...",[],Demand-3,careesma,[]
4,schemanormalized,schemanormalized,freshersworld_897771,0.853763,True,Bhabani Prasad Bhattacharya was born in Joydeb...,Police,https://www.freshersworld.com/,FULL_TIME,0,...,jobs for Constable /Assistant Sub Inspector Be...,"Govt Jobs,Defence,RecruitmentHR,Admin","HR,Admin",,"['bsearch', 'candidate', 'e-commerce market de...","[[{'Skill_name': 'assistant', 'confidence': 99...",[],Demand-4,freshersworld,[]
