In [102]:
import pandas as pd

# Locations of the two raw CSV inputs (relative paths, resolved against the working directory)
user_csv_path = 'Data/User/survey_results_public.csv'
job_csv_path = 'Data/job_descriptions.csv'

# Read each file into its own DataFrame
user_df = pd.read_csv(user_csv_path)
job_df = pd.read_csv(job_csv_path)

In [103]:
def preprocess_text(text, keep_chars=''):
    """Normalise a delimited text field into lowercase, space-separated tokens.

    Steps, in order:
      1. ``;`` and ``/`` are replaced by spaces so they act as token separators.
      2. The text is lower-cased.
      3. Every character that is not alphanumeric, not whitespace and not
         listed in ``keep_chars`` is dropped. Digits count as alphanumeric
         and are therefore kept (``"Unity 3D"`` -> ``"unity 3d"``).
      4. Runs of whitespace are collapsed to one space and the ends trimmed.

    Parameters
    ----------
    text : str, None or float
        The raw value. ``None`` and float NaN (what pandas uses for missing
        object values) yield an empty string instead of raising. Any other
        non-string value is converted with ``str()`` first.
    keep_chars : str, optional
        Extra punctuation characters to preserve in step 3. With the default
        ``''`` all punctuation is stripped, so ``"C++"`` and ``"C#"`` both
        become ``"c"``; pass e.g. ``'+#'`` to keep them distinct.

    Returns
    -------
    str
        The cleaned text; ``''`` when the input is missing or contains no
        retained characters.
    """
    # Missing values: `text != text` is the NaN self-inequality check.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # ';' and '/' separate items in the raw field; turn them into spaces
    text_processed = text.replace(';', ' ').replace('/', ' ')
    # Convert text to lowercase
    text_processed = text_processed.lower()
    # Drop special characters (letters, digits, whitespace and keep_chars survive)
    text_processed = ''.join(
        ch for ch in text_processed
        if ch.isalnum() or ch.isspace() or ch in keep_chars
    )
    # Collapse repeated whitespace and strip the ends
    return ' '.join(text_processed.split())

# User Dataset preprocessing

In [104]:
# Show the column labels of the survey DataFrame
user_df.columns

Index(['ResponseId', 'Q120', 'MainBranch', 'Age', 'Employment', 'RemoteWork',
       'CodingActivities', 'EdLevel', 'LearnCode', 'LearnCodeOnline',
       'LearnCodeCoursesCert', 'YearsCode', 'YearsCodePro', 'DevType',
       'OrgSize', 'PurchaseInfluence', 'TechList', 'BuyNewTool', 'Country',
       'Currency', 'CompTotal', 'LanguageHaveWorkedWith',
       'LanguageWantToWorkWith', 'DatabaseHaveWorkedWith',
       'DatabaseWantToWorkWith', 'PlatformHaveWorkedWith',
       'PlatformWantToWorkWith', 'WebframeHaveWorkedWith',
       'WebframeWantToWorkWith', 'MiscTechHaveWorkedWith',
       'MiscTechWantToWorkWith', 'ToolsTechHaveWorkedWith',
       'ToolsTechWantToWorkWith', 'NEWCollabToolsHaveWorkedWith',
       'NEWCollabToolsWantToWorkWith', 'OpSysPersonal use',
       'OpSysProfessional use', 'OfficeStackAsyncHaveWorkedWith',
       'OfficeStackAsyncWantToWorkWith', 'OfficeStackSyncHaveWorkedWith',
       'OfficeStackSyncWantToWorkWith', 'AISearchHaveWorkedWith',
       'AISearchWan

In [105]:
# Select every column of user_df whose name contains 'HaveWorkedWith'
have_worked_cols = user_df.filter(regex='HaveWorkedWith')
print(have_worked_cols.columns)
# For each row, join the selected columns into one space-separated string,
# then normalise that string with preprocess_text
HaveWorkedWith = (
    have_worked_cols
    .apply(lambda row: row.str.cat(sep=' '), axis=1)
    .apply(preprocess_text)
)
HaveWorkedWith

Index(['LanguageHaveWorkedWith', 'DatabaseHaveWorkedWith',
       'PlatformHaveWorkedWith', 'WebframeHaveWorkedWith',
       'MiscTechHaveWorkedWith', 'ToolsTechHaveWorkedWith',
       'NEWCollabToolsHaveWorkedWith', 'OfficeStackAsyncHaveWorkedWith',
       'OfficeStackSyncHaveWorkedWith', 'AISearchHaveWorkedWith',
       'AIDevHaveWorkedWith'],
      dtype='object')


0                                                         
1        html css javascript python supabase amazon web...
2        bash shell all shells go amazon web services a...
3        bash shell all shells html css javascript php ...
4        html css javascript typescript bigquery elasti...
                               ...                        
89179    html css java javascript sql typescript mongod...
89180    dart java python sql postgresql spring boot fl...
89181    assembly bash shell all shells c c python rust...
89182    bash shell all shells c html css java javascri...
89183    c go javascript sql typescript microsoft sql s...
Length: 89184, dtype: object

In [106]:
# Select every column of user_df whose name contains 'WantToWorkWith'
want_to_work_cols = user_df.filter(regex='WantToWorkWith')
print(want_to_work_cols.columns)
# For each row, join the selected columns into one space-separated string,
# then normalise that string with preprocess_text
want_to_work_with = (
    want_to_work_cols
    .apply(lambda row: row.str.cat(sep=' '), axis=1)
    .apply(preprocess_text)
)
want_to_work_with

Index(['LanguageWantToWorkWith', 'DatabaseWantToWorkWith',
       'PlatformWantToWorkWith', 'WebframeWantToWorkWith',
       'MiscTechWantToWorkWith', 'ToolsTechWantToWorkWith',
       'NEWCollabToolsWantToWorkWith', 'OfficeStackAsyncWantToWorkWith',
       'OfficeStackSyncWantToWorkWith', 'AISearchWantToWorkWith',
       'AIDevWantToWorkWith'],
      dtype='object')


0                                                         
1        bash shell all shells c dart elixir gdscript h...
2        haskell ocaml rust cargo kubernetes nix emacs ...
3        bash shell all shells html css javascript ruby...
4        html css javascript python rust typescript ela...
                               ...                        
89179    bash shell all shells c go html css javascript...
89180    java javascript python sql typescript postgres...
89181    python rust amazon web services aws microsoft ...
89182    bash shell all shells html css javascript powe...
89183    c go javascript sql typescript microsoft sql s...
Length: 89184, dtype: object

In [107]:
# Print the complete string stored under index label 1
# (a fixed entry, not a random sample; print() shows it without truncation)
print(want_to_work_with[1])

bash shell all shells c dart elixir gdscript html css javascript rust firebase realtime database supabase flyio netlify render deno elm nuxtjs react svelte vuejs capacitor electron tauri uno platform xamarin godot npm pnpm unity 3d unreal engine vite webpack yarn vim visual studio code github discussions linear notion trello discord signal slack zoom chatgpt neeva ai github copilot


In [108]:
# Place the two text Series side by side in a single DataFrame;
# `keys` supplies the column names for the result
user_merge = pd.concat(
    [HaveWorkedWith, want_to_work_with],
    axis=1,
    keys=['HaveWorkedWith', 'WantToWorkWith'],
)
user_merge

Unnamed: 0,HaveWorkedWith,WantToWorkWith
0,,
1,html css javascript python supabase amazon web...,bash shell all shells c dart elixir gdscript h...
2,bash shell all shells go amazon web services a...,haskell ocaml rust cargo kubernetes nix emacs ...
3,bash shell all shells html css javascript php ...,bash shell all shells html css javascript ruby...
4,html css javascript typescript bigquery elasti...,html css javascript python rust typescript ela...
...,...,...
89179,html css java javascript sql typescript mongod...,bash shell all shells c go html css javascript...
89180,dart java python sql postgresql spring boot fl...,java javascript python sql typescript postgres...
89181,assembly bash shell all shells c c python rust...,python rust amazon web services aws microsoft ...
89182,bash shell all shells c html css java javascri...,bash shell all shells html css javascript powe...


## Word2Vec

In [109]:
# One-time setup, intentionally left disabled: uncomment to install gensim if it is missing.
# !pip install gensim

In [110]:
# Toy Word2Vec example on two hard-coded sentences (does not use the survey or job data).
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
# Fetch the 'punkt' tokenizer data used by word_tokenize.
# NOTE(review): the recorded output of this cell shows this download failing with an
# SSL certificate error while a vector is still printed, so tokenization did not depend
# on the download succeeding in that run (presumably punkt was already available locally).
nltk.download('punkt')

# Sample sentences
sentences = ["I love natural language processing", "Word embeddings are interesting"]

# Tokenizing the sentences (lower-cased first, so lookups below must use lowercase words)
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Training the Word2Vec model
# vector_size=100: dimensionality of each word vector; window=5: context window size;
# min_count=1: keep words that occur only once (every word in this corpus is unique);
# workers=4: number of training threads.
# NOTE(review): the name `model` is rebound to a BERT model in a later cell.
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Get the word vector for a given word ('language' appears in the first sample sentence)
word_vector = model.wv['language']

print(word_vector)

[-8.7274825e-03  2.1301615e-03 -8.7354420e-04 -9.3190884e-03
 -9.4281426e-03 -1.4107180e-03  4.4324086e-03  3.7040710e-03
 -6.4986930e-03 -6.8730675e-03 -4.9994122e-03 -2.2868442e-03
 -7.2502876e-03 -9.6033178e-03 -2.7436293e-03 -8.3628409e-03
 -6.0388758e-03 -5.6709289e-03 -2.3441375e-03 -1.7069972e-03
 -8.9569986e-03 -7.3519943e-04  8.1525063e-03  7.6904297e-03
 -7.2061159e-03 -3.6668312e-03  3.1185520e-03 -9.5707225e-03
  1.4764392e-03  6.5244664e-03  5.7464195e-03 -8.7630618e-03
 -4.5171441e-03 -8.1401607e-03  4.5956374e-05  9.2636338e-03
  5.9733056e-03  5.0673080e-03  5.0610625e-03 -3.2429171e-03
  9.5521836e-03 -7.3564244e-03 -7.2703874e-03 -2.2653891e-03
 -7.7856064e-04 -3.2161034e-03 -5.9258583e-04  7.4888230e-03
 -6.9751858e-04 -1.6249407e-03  2.7443992e-03 -8.3591007e-03
  7.8558037e-03  8.5361041e-03 -9.5840869e-03  2.4462664e-03
  9.9049713e-03 -7.6658037e-03 -6.9669187e-03 -7.7365171e-03
  8.3959233e-03 -6.8133592e-04  9.1444086e-03 -8.1582209e-03
  3.7430846e-03  2.63504

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


## Transformer

In [111]:
# Toy BERT example on one hard-coded sentence (does not use the survey or job data).
from transformers import BertTokenizer, BertModel
import torch

# Initialize tokenizer and model from the 'bert-base-uncased' checkpoint
# NOTE(review): this rebinds `model`, which an earlier cell used for the Word2Vec model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Encode text; return_tensors="pt" asks the tokenizer for PyTorch tensors
input_text = "Here is some text to encode"
inputs = tokenizer(input_text, return_tensors="pt")

# Get embeddings; no_grad() disables gradient tracking since this is inference only
with torch.no_grad():
    outputs = model(**inputs)

# Read the last hidden state from the model output by attribute name
# (the recorded output shows a 3-D tensor — presumably (batch, tokens, hidden); TODO confirm)
last_hidden_states = outputs.last_hidden_state

print(last_hidden_states)

tensor([[[-0.0549,  0.1053, -0.1065,  ..., -0.3551,  0.0686,  0.6506],
         [-0.5759, -0.3650, -0.1383,  ..., -0.6782,  0.2092, -0.1639],
         [-0.1641, -0.5597,  0.0150,  ..., -0.1603, -0.1346,  0.6216],
         ...,
         [ 0.2448,  0.1254,  0.1587,  ..., -0.2749, -0.1163,  0.8809],
         [ 0.0481,  0.4950, -0.2827,  ..., -0.6097, -0.1212,  0.2527],
         [ 0.9046,  0.2137, -0.5897,  ...,  0.3040, -0.6172, -0.1950]]])


# Job Dataset preprocessing

In [112]:
# Display the job DataFrame (the recorded output is pandas' truncated first/last-rows view)
job_df

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615935,134563577088850,0 to 12 Years,B.Tech,$64K-$114K,"Malabo (de jure),",Equatorial Guinea,1.6508,10.2679,Full-Time,18281,...,950-451-5843,Mechanical Engineer,Mechanical Design Engineer,ZipRecruiter,Mechanical Design Engineers create and develop...,"{'Employee Assistance Programs (EAP), Tuition ...","Mechanical engineering CAD software (e.g., Sol...","Design mechanical systems, components, and pro...",The Hershey Company,"{""Sector"":""Food and Beverage/Confectionery"",""I..."
1615936,618604818190827,2 to 14 Years,M.Tech,$62K-$130K,Warsaw,Poland,51.9194,19.1451,Intern,63621,...,676.387.1572x71877,IT Manager,IT Director,USAJOBS,An IT Director oversees an organizations IT de...,"{'Health Insurance, Retirement Plans, Paid Tim...",Strategic IT planning Leadership and managemen...,Provide strategic leadership for IT department...,EQT,"{""Sector"":""Energy"",""Industry"":""Energy"",""City"":..."
1615937,615471367712200,4 to 15 Years,BCA,$60K-$96K,Ashgabat,Turkmenistan,38.9697,59.5563,Part-Time,114287,...,537.384.6193x5284,Mechanical Engineer,Mechanical Design Engineer,Indeed,Mechanical Design Engineers create and develop...,"{'Tuition Reimbursement, Stock Options or Equi...","Mechanical engineering CAD software (e.g., Sol...","Design mechanical systems, components, and pro...",KLA,"{""Sector"":""Technology"",""Industry"":""Semiconduct..."
1615938,804137342023945,5 to 15 Years,BCA,$65K-$103K,Ouagadougou,Burkina Faso,12.2383,-1.5616,Full-Time,45009,...,(484)257-4755x5346,HR Coordinator,Training Coordinator,Stack Overflow Jobs,Training Coordinators design and implement emp...,"{'Casual Dress Code, Social and Recreational A...",Training program coordination Training materia...,"Coordinate employee training programs, track t...",Mahindra & Mahindra,"{""Sector"":""Automotive"",""Industry"":""Automotive""..."


In [113]:
# Show the column labels of the job DataFrame
job_df.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')