In [14]:
# !pip install texthero

In [2]:
import pandas as pd
from texthero import remove_whitespace
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("fake_job_postings.csv")

In [4]:
from bs4 import BeautifulSoup


def remove_html(html_str: str):
    if isinstance(html_str, str):
        soup = BeautifulSoup(html_str)
        return soup.get_text().replace(u"\xa0", u" ")
    else:
        return html_str


In [5]:
import re


def lower_abbreviated(text: str):

    if isinstance(text, str):
        return " ".join(
            [
                word.lower() if word.isupper() or word.startswith("#") else word
                for word in re.split(" ", text)
            ]
        )
    else:
        return text

In [6]:
def split_on_uppercase(text: str):
    if isinstance(text, str):
        text_list = [s for s in re.split("([A-Z][^A-Z]*)", text) if s]
        return " ".join(text_list)
    else:
        return text

In [7]:
df["requirements"] = (
    df["requirements"]
    .apply(remove_html)
    .apply(lower_abbreviated)
    .apply(split_on_uppercase)
    .fillna("")
)


No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.


 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")



"b'.'" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.



In [8]:
df["company_profile"] = (
    df["company_profile"]
    .apply(remove_html)
    .apply(lower_abbreviated)
    .apply(split_on_uppercase)
    .fillna("")
)


In [9]:
df.requirements = df.requirements.pipe(remove_whitespace)
df.company_profile = df.company_profile.pipe(remove_whitespace)

In [10]:
X, y = df.drop(columns="fraudulent"), df["fraudulent"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [12]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [13]:
train.to_pickle("train_fake_jobs.pkl")
test.to_pickle("test_fake_jobs.pkl")