In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [2]:
# Read data from csv file.
df = pd.read_csv("../input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15185 non-null  object
 8   benefits             10670 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

In [3]:
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [4]:
# See the correlation
corr = df.corr().fraudulent
corr

job_id              0.079872
telecommuting       0.034523
has_company_logo   -0.261971
has_questions      -0.091627
fraudulent          1.000000
Name: fraudulent, dtype: float64

In [5]:
# Clean text from the columns of company_profile, description, requirements, benefits and fraudulent
# Put them into a new dataframe for easier manipulation
text_df = df[['company_profile', 'description', 'requirements', 'benefits', 'fraudulent']].copy()
# Check the shape of text_df
text_df.shape

(17880, 5)

In [6]:
# Cleaning the description column first, the easiest since there are no missing values.
def cleanDescrip(desc):
    
    # First remove #URLasna209213..a# if present
    reg = "#URL_[a-z0-9]*#"
    res_str = re.sub(reg," ",desc)
    
    # Replace â€™ with '
    res_str = re.sub(r"â€™","'",res_str)
    
    # Next, remove non-alphanumeric and non-punctuation characters
    reg = "[^a-zA-Z0-9\s:.',-_!?()@;]"
    res_str = re.sub(reg,"",res_str)
    
    # Now, correct the wordWord instances
    reg = "([a-z])([A-Z])"
    res_str = re.sub(reg,r"\1 \2",res_str)
    
    # Add a space where : is present, to ease later cleaning
    res_str = re.sub(":"," : ",res_str)
    
    # Replace multiple spaces with a single space, and remove spaces from ends
    reg = "[\s]+"
    res_str = re.sub(reg," ",res_str).strip()
    
    # Now we can do the usual cleaning
    res_str = clean_text(res_str)
    
    return res_str

In [7]:
# Knowledge from Natural Language Processing (CS6120) course
def clean_text(review):
    """
    Input:
        review: a string containing a review.
    Output:
        review_cleaned: a processed review. 
    """
    stop_words = set(stopwords.words('english'))
    
    #Replace non alphanumeric and non-whitespace characters with empty character
    review_cleaned = re.sub(r'[^\w\s]',"", review)
    #Lowercase the entire review
    review_cleaned = review_cleaned.lower()
    words = nltk.word_tokenize(review_cleaned)
    
    remStop = []
    for word in words:
        if word not in stop_words:
            #Convert review into a list, and then add the non-stopword words to a new list and make that into a string
            remStop.append(word)
        
    review_cleaned = ' '.join(words for words in remStop)
    
    return review_cleaned

In [8]:
# Applying this to entire description column in the df
ls = []
for i in range(text_df.shape[0]):
    try:
        text_df['description'][i] = cleanDescrip(text_df['description'][i]) 
    except Exception as e:
        print(i)
        ls.append(i)
        print(e)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


17513
expected string or bytes-like object


In [9]:
# We can drop those bad rows, since the number of rows is large
for i in ls:
    text_df = text_df.drop(text_df.index[i])

In [10]:
text_df.isnull()

Unnamed: 0,company_profile,description,requirements,benefits,fraudulent
0,False,False,False,True,False
1,False,False,False,False,False
2,False,False,False,True,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
17875,False,False,False,False,False
17876,False,False,False,False,False
17877,False,False,False,True,False
17878,True,False,False,False,False


In [11]:
text_df.fillna("NA",inplace=True)

In [12]:
def cleanCols(comp):
    res_str = cleanDescrip(comp)
    return res_str

In [13]:
for i in range(len(text_df['company_profile'])):
    try:
        text_df['company_profile'][i] = cleanCols(text_df['company_profile'][i])
        text_df['requirements'][i] = cleanCols(text_df['requirements'][i])
        text_df['benefits'][i] = cleanCols(text_df['requirements'][i])
    except Exception as e:
        print(i)
        print(e)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


17513
17513


In [14]:
text_df.head()

Unnamed: 0,company_profile,description,requirements,benefits,fraudulent
0,food52 weve created groundbreaking awardwinnin...,food52 fastgrowing james beard awardwinning on...,experience content management systems major pl...,experience content management systems major pl...,0
1,90 seconds worlds cloud video production servi...,organised focused vibrant awesomedo passion cu...,expect key responsibility communicate client 9...,expect key responsibility communicate client 9...,0
2,valor services provides workforce solutions me...,client located houston actively seeking experi...,implement precommissioning commissioning proce...,implement precommissioning commissioning proce...,0
3,passion improving quality life geography heart...,company esri environmental systems research in...,education bachelors masters gis business admin...,education bachelors masters gis business admin...,0
4,spot source solutions llc global human capital...,job title itemization review manager location ...,qualifications rn license state texas diploma ...,qualifications rn license state texas diploma ...,0


In [15]:
text_df.shape

(17879, 5)

In [16]:
text_df.to_pickle("cleaned.pkl")