In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re
from scipy import sparse

In [2]:
data = pd.read_csv('data/processed_data.csv')
data.head()

Unnamed: 0,job_id,title,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,...,required_education,industry,function,fraudulent,country_code,city_name,company_profile_tokenized,description_tokenized,requirements_tokenized,benefits_tokenized
0,1,Marketing Intern,Marketing,unknown,We re Food52 and we ve created a groundbreaki...,Food52 a fast growing James Beard Award winn...,Experience with content management systems a m...,unknown,0,1,...,unknown,unknown,Marketing,0,US,New York,"['We', 'Food52', 'created', 'groundbreaking', ...","['Food52', 'fast', 'growing', 'James', 'Beard'...","['Experience', 'content', 'management', 'syste...",[]
1,2,Customer Service - Cloud Video Production,Success,unknown,90 Seconds the worlds Cloud Video Production ...,Organised Focused Vibrant Awesome Do you...,What we expect from you Your key responsibilit...,What you will get from usThrough being part of...,0,1,...,unknown,Marketing and Advertising,Customer Service,0,NZ,Auckland,"['90', 'Seconds', 'worlds', 'Cloud', 'Video', ...","['Organised', 'Focused', 'Vibrant', 'Awesome',...","['What', 'expect', 'Your', 'key', 'responsibil...","['What', 'get', 'usThrough', 'part', '90', 'Se..."
2,3,Commissioning Machinery Assistant (CMA),unknown,unknown,Valor Services provides Workforce Solutions th...,Our client located in Houston is actively se...,Implement pre commissioning and commissioning ...,unknown,0,1,...,unknown,unknown,unknown,0,US,Wever,"['Valor', 'Services', 'provides', 'Workforce',...","['Our', 'client', 'located', 'Houston', 'activ...","['Implement', 'pre', 'commissioning', 'commiss...",[]
3,4,Account Executive - Washington DC,Sales,unknown,Our passion for improving quality of life thro...,THE COMPANY ESRI Environmental Systems Rese...,EDUCATION Bachelor s or Master s in GIS busi...,Our culture is anything but corporate we have ...,0,1,...,Bachelor's Degree,Computer Software,Sales,0,US,Washington,"['Our', 'passion', 'improving', 'quality', 'li...","['THE', 'COMPANY', 'ESRI', 'Environmental', 'S...","['EDUCATION', 'Bachelor', 'Master', 'GIS', 'bu...","['Our', 'culture', 'anything', 'corporate', 'c..."
4,5,Bill Review Manager,unknown,unknown,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE Itemization Review ManagerLOCATION ...,QUALIFICATIONS RN license in the State of Texa...,Full Benefits Offered,0,1,...,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,Fort Worth,"['SpotSource', 'Solutions', 'LLC', 'Global', '...","['JOB', 'TITLE', 'Itemization', 'Review', 'Man...","['QUALIFICATIONS', 'RN', 'license', 'State', '...","['Full', 'Benefits', 'Offered']"


### For EDA, we replaced the null values with 'unknown'. We need to remove features which have large number of 'unknown' values as they will not help our model

In [3]:
data.columns

Index(['job_id', 'title', 'department', 'salary_range', 'company_profile',
       'description', 'requirements', 'benefits', 'telecommuting',
       'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent', 'country_code', 'city_name', 'company_profile_tokenized',
       'description_tokenized', 'requirements_tokenized',
       'benefits_tokenized'],
      dtype='object')

In [4]:
drop_cols = ['job_id', 'department', 'salary_range', 'company_profile_tokenized', 'description_tokenized',
            'requirements_tokenized', 'benefits_tokenized']

In [5]:
#dropping columns not required
data.drop(drop_cols, axis=1, inplace=True)

In [6]:
data['title'] = data['title'].replace({'unknown': 'title_unknown'})
data['employment_type'] = data['employment_type'].replace({'unknown': 'employment_type_unknown'})
data['required_experience'] = data['required_experience'].replace({'unknown': 'required_experience_unknown'})
data['required_education'] = data['required_education'].replace({'unknown': 'required_education_unknown'})
data['function'] = data['function'].replace({'unknown': 'function_unknown'})
data['country_code'] = data['country_code'].replace({'unknown': 'country_code_unknown'})
data['city_name'] = data['city_name'].replace({'unknown': 'city_name_unknown'})
data['industry'] = data['industry'].replace({'unknown': 'industry_unknown'})

In [7]:
data.head()

Unnamed: 0,title,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country_code,city_name
0,Marketing Intern,We re Food52 and we ve created a groundbreaki...,Food52 a fast growing James Beard Award winn...,Experience with content management systems a m...,unknown,0,1,0,Other,Internship,required_education_unknown,industry_unknown,Marketing,0,US,New York
1,Customer Service - Cloud Video Production,90 Seconds the worlds Cloud Video Production ...,Organised Focused Vibrant Awesome Do you...,What we expect from you Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,required_education_unknown,Marketing and Advertising,Customer Service,0,NZ,Auckland
2,Commissioning Machinery Assistant (CMA),Valor Services provides Workforce Solutions th...,Our client located in Houston is actively se...,Implement pre commissioning and commissioning ...,unknown,0,1,0,employment_type_unknown,required_experience_unknown,required_education_unknown,industry_unknown,function_unknown,0,US,Wever
3,Account Executive - Washington DC,Our passion for improving quality of life thro...,THE COMPANY ESRI Environmental Systems Rese...,EDUCATION Bachelor s or Master s in GIS busi...,Our culture is anything but corporate we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,US,Washington
4,Bill Review Manager,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE Itemization Review ManagerLOCATION ...,QUALIFICATIONS RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,Fort Worth


In [21]:
#replacing 'unknown' in textual columns with blank space as we have to build vectors
data['company_profile'] = data['company_profile'].replace({'unknown': ''})
data['description'] = data['description'].replace({'unknown': ''})
data['requirements'] = data['requirements'].replace({'unknown': ''})
data['benefits'] = data['benefits'].replace({'unknown': ''})

In [22]:
data.columns

Index(['company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'fraudulent',
       'company_profile_tfidf', 'benefits_tfidf', 'description_tfidf',
       'requirements_tfidf'],
      dtype='object')

In [23]:
categorical_cols = ['title', 'employment_type', 'required_experience', 'required_education', 
                    'industry', 'function', 'country_code', 'city_name']

In [24]:
encoded_data = pd.DataFrame()
for col in categorical_cols:
    col_dummies = pd.get_dummies(data[col])
    encoded_data = pd.concat([encoded_data, col_dummies], axis=1)
    data.drop(col, axis=1, inplace=True)

KeyError: 'title'

In [12]:
encoded_data.head()

Unnamed: 0,Electrician,Environmental Technician I,Piping Material Engineer,"Discipline Manager Civil, Structural, Marine, Architectural",FEA Senior engineer,Manager of Project Management Organization - Engineering,Mechanical Engineering Manager,"Resources Change Management, Process Excellence and Change Enablement Manager",ServiceNow Consultant,AUTOCAD OPERATOR,...,wilmington,woodbridge,woodruff,worcester,İstanbul,Αthens,Αθήνα,ΕΛΛΗΝΙΚΟ,마포구 동교동,city_name_unknown
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
encoded_data.shape

(17880, 13854)

In [14]:
tfidf_vectorizer = TfidfVectorizer()
data['company_profile_tfidf'] = tfidf_vectorizer.fit_transform(data['company_profile'])
data['benefits_tfidf'] = tfidf_vectorizer.fit_transform(data['benefits'])
data['description_tfidf'] = tfidf_vectorizer.fit_transform(data['description'])
data['requirements_tfidf'] = tfidf_vectorizer.fit_transform(data['requirements'])

In [15]:
data.shape

(17880, 12)

In [16]:
encoded_data['company_profile_tfidf'] = data['company_profile_tfidf']
encoded_data['benefits_tfidf'] = data['benefits_tfidf']
encoded_data['description_tfidf'] = data['description_tfidf']
encoded_data['requirements_tfidf'] = data['requirements_tfidf']

In [19]:
encoded_data['company_profile_tfidf'][0]

<17880x14824 sparse matrix of type '<class 'numpy.float64'>'
	with 1141404 stored elements in Compressed Sparse Row format>