In [10]:
def import_csv(data_dir="data"):
    """Load the five project CSV files into DataFrames.

    Parameters
    ----------
    data_dir : str or path-like, default "data"
        Directory containing ``applications.csv``, ``jobs.csv``,
        ``test_users.csv``, ``users.csv`` and ``users_history.csv``.
        The default keeps the original hard-coded location.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(applications, jobs, test_users, users, users_history)``, in that
        order.
    """
    table_names = ("applications", "jobs", "test_users", "users", "users_history")
    # "/" is used as the separator to match the original relative paths.
    return tuple(pd.read_csv(f"{data_dir}/{name}.csv") for name in table_names)

In [4]:
def df_history_list(users_history):
    """Collapse the job history into one row per user with a list of titles.

    Rows with a missing ``JobTitle`` are ignored and the remaining titles are
    lower-cased. Users appear in order of first occurrence and each user's
    titles keep their original row order. The input frame is not modified.

    Parameters
    ----------
    users_history : pandas.DataFrame
        Must contain the columns ``UserID`` and ``JobTitle``.

    Returns
    -------
    pandas.DataFrame
        Columns ``UserID`` and ``users_history`` (a list of lower-cased
        titles). Empty, but with both columns present, when no row has a
        title.

    Notes
    -----
    Rows whose ``UserID`` is missing are dropped by ``groupby``.
    """
    has_title = users_history["JobTitle"].notna()
    titled = users_history.loc[has_title, ["UserID", "JobTitle"]]
    # assign() builds a new frame, so nothing is written into a slice of the
    # caller's data.
    titled = titled.assign(JobTitle=titled["JobTitle"].map(str.lower))

    # A single groupby pass replaces re-filtering the whole frame once per
    # user; sort=False keeps users in first-appearance order.
    titles_per_user = titled.groupby("UserID", sort=False)["JobTitle"].agg(list)

    df_users_history_list = titles_per_user.reset_index()
    df_users_history_list.columns = ["UserID", "users_history"]
    return df_users_history_list

In [1]:
def clean_html(df,feature):
    """Return column ``feature`` of ``df`` with HTML markup and filler removed.

    Each value is parsed with BeautifulSoup (``lxml`` parser) and reduced to
    its text. The substrings listed below are then deleted in order. The
    first three are the literal two-character sequences backslash + ``n``,
    ``r`` or ``t``, not real control characters. The last two are the
    non-breaking space and the zero-width space.

    Returns a new Series; ``df`` is not modified.
    """
    unwanted = ("\\n", "\\r", "\\t", "\xa0", "\u200b")

    def _to_plain_text(raw):
        text = BeautifulSoup(raw, "lxml").text
        for token in unwanted:
            text = text.replace(token, "")
        return text

    return df[feature].apply(_to_plain_text)


In [3]:
def convert_datetime(df):
    """Return a copy of ``df`` with every column parsed by ``pd.to_datetime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all hold date/time values (e.g. strings).

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape and labels; ``df`` is not modified.
    """
    # infer_datetime_format is deprecated since pandas 2.0 (format inference
    # is now the default) and only produced a warning per column.
    return df.apply(lambda column: pd.to_datetime(column))

In [4]:
def df_create(users,users_history,applications,jobs):
    """Join applications, jobs, users and the raw user history into one frame.

    Steps:
      1. ``applications`` is joined to ``jobs`` on ``JobID``.
      2. The joined frame's columns are renamed *by position* to the twelve
         names below, so the join must yield exactly twelve columns in that
         order.
      3. The result is joined to ``users`` and then to ``users_history``,
         both on ``UserID``.

    Returns the final merged DataFrame, with one row per matching
    application / history-row combination.
    """
    renamed_columns = [
        'UserID', 'ApplicationDate', 'JobID', 'Title', 'Description',
        'Requirements', 'Job_City', 'Job_State', 'Job_Country',
        'Zip5', 'StartDate', 'EndDate',
    ]

    job_applications = applications.merge(jobs, on="JobID")
    job_applications.columns = renamed_columns

    return (
        job_applications
        .merge(users, on="UserID")
        .merge(users_history, on="UserID")
    )


In [5]:
def df_create_concat_history(users,users_history,applications,jobs):
    """Join applications, jobs and users with each user's history as a list.

    The history is first condensed by ``df_history_list`` to one row per
    user. ``applications`` is then joined to ``jobs`` on ``JobID`` and the
    joined columns are renamed *by position* to the twelve names below, so
    the join must yield exactly twelve columns in that order. Finally the
    result is joined to ``users`` and to the condensed history on ``UserID``.

    Returns the final merged DataFrame.
    """
    history_per_user = df_history_list(users_history)

    renamed_columns = [
        'UserID', 'ApplicationDate', 'JobID', 'Title', 'Description',
        'Requirements', 'Job_City', 'Job_State', 'Job_Country',
        'Zip5', 'StartDate', 'EndDate',
    ]

    job_applications = applications.merge(jobs, on="JobID")
    job_applications.columns = renamed_columns

    return (
        job_applications
        .merge(users, on="UserID")
        .merge(history_per_user, on="UserID")
    )

In [12]:
def cosine(u, v):
    """Return the cosine similarity of vectors ``u`` and ``v``.

    This is the dot product divided by the product of the two norms. No
    guard is applied for zero-length vectors, so the division is left to
    numpy in that case.
    """
    dot_product = np.dot(u, v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return dot_product / norm_product

In [14]:
def test_preprocessing():
    """Build the list of (user, job title) pairs to predict for, with embeddings.

    Takes no arguments: ``test_users``, ``users_history`` and ``model`` are
    read from the enclosing (module/notebook) scope.

    Returns a DataFrame with columns ``UserID``, ``JobTitle`` and
    ``text_embedding``. It holds the distinct pairs from the history of the
    test users, without missing titles, re-indexed from 0.
    """
    history_titles = test_users.merge(users_history,on="UserID")[["UserID","JobTitle"]]
    predict_list = history_titles.drop_duplicates()
    predict_list = predict_list[predict_list["JobTitle"].notna()]
    predict_list = predict_list.reset_index(drop=True)

    # NOTE(review): `model` is presumably a text-embedding model whose
    # encode() maps a string to a vector; confirm where it is defined.
    predict_list["text_embedding"] = predict_list.JobTitle.apply(
        lambda title: model.encode(title))
    return predict_list

In [16]:
def predict_job(predict_list,jobs,top_k=3):
    """Attach the ``top_k`` most similar jobs to every row of ``predict_list``.

    For each query embedding, every job embedding is scored with ``cosine``.
    The jobs are sorted by descending similarity and the first ``top_k`` are
    kept.

    Parameters
    ----------
    predict_list : pandas.DataFrame
        Must contain a ``text_embedding`` column. It is modified in place
        (three columns are added) and also returned.
    jobs : pandas.DataFrame
        Must contain ``JobID``, ``Title`` and ``text_embedding`` columns.
        Not modified.
    top_k : int, default 3
        Number of best-matching jobs to keep per row. The default matches
        the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        ``predict_list`` with the list-valued columns ``predict_id``,
        ``predict_job`` and ``similarity``, each ordered from most to least
        similar.
    """
    # Only these two columns are reported back, so select them once instead
    # of copying the entire jobs frame for every query row.
    job_labels = jobs[["JobID", "Title"]]

    top_ids = []
    top_titles = []
    top_scores = []

    for test_embedding in predict_list.text_embedding.tolist():
        scores = jobs.text_embedding.apply(
            lambda job_embedding: cosine(job_embedding, test_embedding))
        best = (
            job_labels.assign(similarity=scores)
            .sort_values(by=["similarity"], ascending=False)
            .head(top_k)  # positional, independent of the index labels
        )
        top_ids.append(best.JobID.tolist())
        top_titles.append(best.Title.tolist())
        top_scores.append(best.similarity.tolist())

    predict_list["predict_id"] = top_ids
    predict_list["predict_job"] = top_titles
    predict_list["similarity"] = top_scores

    return predict_list

In [3]:
def create_df_job(jobs,applications,users_history,users):
    """Count how often each user is associated with each job title.

    Titles come from two sources, stacked together:
      * jobs the user applied to (``jobs`` joined to ``applications`` on
        ``JobID``, using the job ``Title``);
      * the user's job history (``users_history`` joined to ``users`` on
        ``UserID``, so only users present in ``users`` contribute).

    Missing titles are dropped and the rest are lower-cased before counting.

    Returns
    -------
    pandas.DataFrame
        Columns ``UserID``, ``JobTitle`` and ``NB`` (number of occurrences),
        sorted by ``UserID`` then ``JobTitle``.
    """
    applied_titles = jobs.merge(applications, on="JobID")[["UserID", "Title"]]
    applied_titles.columns = ["UserID", "JobTitle"]
    past_titles = users_history.merge(users, on="UserID")[["UserID", "JobTitle"]]

    all_titles = pd.concat([applied_titles, past_titles])
    all_titles = all_titles[all_titles["JobTitle"].notna()]
    # assign() returns a new frame, avoiding a column write on a filtered
    # slice (SettingWithCopyWarning).
    all_titles = all_titles.assign(JobTitle=all_titles["JobTitle"].map(str.lower))

    return all_titles.groupby(["UserID", "JobTitle"]).size().reset_index(name="NB")

In [13]:
def df_unique_job_id(users_history,jobs):
    """Build a lookup table assigning an integer id to every distinct job title.

    Titles are taken from ``users_history.JobTitle`` followed by
    ``jobs.Title``. Missing values are dropped, titles are lower-cased and
    duplicates are removed, keeping the first occurrence.

    Returns
    -------
    pandas.DataFrame
        Columns ``jobID`` (0..n-1, in order of first appearance) and
        ``JobTitle`` (lower-cased).
    """
    all_titles = pd.DataFrame(
        users_history.JobTitle.tolist() + jobs.Title.tolist(),
        columns=["JobTitle"],
    )
    titles = all_titles.loc[all_titles["JobTitle"].notna(), "JobTitle"].map(str.lower)

    # The first reset discards the old row labels; the second turns the
    # fresh 0..n-1 index into the id column.
    df_job_unique = titles.drop_duplicates().reset_index(drop=True).reset_index()

    df_job_unique.columns = ["jobID", "JobTitle"]
    return df_job_unique

In [5]:
def df_create_test(test_users,users_history,df_job_unique):
    """Count the job titles in each test user's history and attach title ids.

    ``test_users`` is joined to ``users_history`` on ``UserID``. Missing
    titles are dropped, the rest are lower-cased and counted per
    ``(UserID, JobTitle)``. The counts are then joined to ``df_job_unique``
    on ``JobTitle``. Because titles are lower-cased before that join, only
    lower-case titles in ``df_job_unique`` can match, and unmatched rows are
    dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``UserID``, ``JobTitle``, ``NB`` (occurrence count) followed
        by the remaining columns of ``df_job_unique``.
    """
    history = test_users.merge(users_history, on="UserID")[["UserID", "JobTitle"]]
    history = history[history["JobTitle"].notna()]
    # assign() returns a new frame, avoiding a column write on a filtered
    # slice (SettingWithCopyWarning).
    history = history.assign(JobTitle=history["JobTitle"].map(str.lower))

    title_counts = history.groupby(["UserID", "JobTitle"]).size().reset_index(name="NB")
    return title_counts.merge(df_job_unique, on="JobTitle")