# Market Basket Analysis on the [LinkedIn job skills](https://www.kaggle.com/datasets/asaniczka/1-3m-linkedin-jobs-and-skills-2024) dataset

### Author: Adriano Meligrana

In [None]:
# logging in into Kaggle and downloading the dataset

import os

os.environ['KAGGLE_USERNAME'] = "xxxxxx"
os.environ['KAGGLE_KEY'] = "xxxxxx"

!kaggle datasets download -d asaniczka/1-3m-linkedin-jobs-and-skills-2024

!unzip 1-3m-linkedin-jobs-and-skills-2024.zip -d data

In [4]:
# creating the Spark session

import pyspark
from pyspark.sql import SparkSession

# Local Spark session using every available core; the driver memory is raised
# because apriori() collects data on the driver.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "15g")
    .appName('apriori')
    .getOrCreate()
)

In [5]:
# importing the dataset into spark
# (the download cell extracts the archive with `-d data`, so the CSV lives in ./data)

df = spark.read.csv("./data/job_skills.csv", header=True, sep=",")
# keep every column except the first one (presumably the job link/id — confirm
# against the CSV header); the cells below only read the `job_skills` column
df = df.select(df.columns[1:])

In [6]:
# some preprocessing

from pyspark.sql import Row

def preprocessing(row):
    """
    Turn the comma-separated `job_skills` string of a row into a basket of skills.

    The text is lower-cased, a few generic suffixes (" capacities", " abilities",
    " skills") are removed and some common spelling variants are unified, so that
    e.g. "Teamwork" and "team work" count as the same item.

    row -> any object exposing a `job_skills` string attribute.

    Returns a sorted list of unique, whitespace-stripped, non-empty skill names.
    """
    # applied in order on the lower-cased string
    replacements = (
        (" capacities", ""),
        (" abilities", ""),
        (" skills", ""),
        ("decisionmaking", "decision making"),
        ("problemsolving", "problem solving"),
        ("teamwork", "team work"),
        ("problem-solving", "problem solving"),
    )

    skills_str = row.job_skills.lower()
    for old, new in replacements:
        skills_str = skills_str.replace(old, new)

    # Strip each token so that " sql" and "sql" are the same item, and drop the
    # empty tokens produced by trailing or doubled separators.
    skills = {skill.strip() for skill in skills_str.split(", ")}
    skills.discard("")

    # sorted -> the basket no longer depends on set iteration order
    return sorted(skills)

def remap_rdd(row):
    """Wrap a basket into a Row with a single `job_skills` field, so that the RDD can be converted with toDF()."""
    return Row(job_skills=row)

# drop the rows containing nulls, since preprocessing() needs a string
df = df.na.drop()
# string -> list of skills -> Row(job_skills=[...])
rdd = df.rdd.map(preprocessing).map(remap_rdd)
# back to a DataFrame, spread over 6 partitions
df = rdd.toDF().repartition(6)

                                                                                

## Apriori algorithm implementation

In [7]:
import pyspark
import timeit
import gc
import pandas as pd
from operator import add 
from itertools import combinations

"""
apriori(df, basket_col, support_threshold = 0.01, max_size = 6, sample_fraction = 0.1, seed = None, cache = True, quiet = False)
  
  df                -> a PySpark DataFrame object.
  basket_col        -> the column of the dataframe containing baskets.
  support_threshold -> the percentage value of the support threshold, e.g. s = 1% -> s_threshold = 0.01.
  max_size          -> the largest size of frequent itemsets to check, e.g. if max_size = 2, only singletons and pairs will be checked.
  sample_fraction   -> sample fraction of baskets which will be used to find frequent itemsets.
  seed              -> seed used when sampling.
  cache             -> flag which decides if caching should be applied when running the algorithm.
  quiet             -> flag which decides if some information on the progression will be printed on screen.
"""
def apriori(df, basket_col, support_threshold = 0.01, max_size = 6, sample_fraction = 0.1, seed = None, cache = True, quiet = False):
    """
    Find the frequent itemsets of sizes 1..max_size with a level-wise Apriori search.

      df                -> a PySpark DataFrame object.
      basket_col        -> the column of the dataframe containing baskets (lists of hashable items).
      support_threshold -> the support threshold as a fraction of the number of baskets, e.g. 1% -> 0.01.
                           An itemset is frequent when it occurs in at least
                           int(n_baskets * support_threshold) + 1 baskets.
      max_size          -> the largest itemset size to check, e.g. if max_size = 2, only singletons and pairs.
      sample_fraction   -> fraction of baskets sampled (without replacement) before mining; 1 disables sampling.
      seed              -> seed used when sampling.
      cache             -> if True, the pruned baskets of each level are persisted in Spark.
      quiet             -> if False, progress information and each result table are printed/displayed.

    Returns a list of max_size pandas DataFrames; the k-th one has the columns
    item_1..item_k and "frequency", sorted by decreasing frequency.
    """
    if sample_fraction != 1:
        df = df.sample(False, sample_fraction, seed = seed)

    rdd = df.rdd.map(lambda row: row[basket_col])

    # NOTE: the whole (sampled) dataset is brought to the driver to count the
    # baskets and to assign an integer id to every distinct item.
    rdd_collected = rdd.collect()
    support = int(len(rdd_collected)*support_threshold) + 1
    uniques = {e: i for i, e in enumerate(set(item for basket in rdd_collected for item in basket))}
    uniques_vec = list(uniques.keys())
    # Release the driver-side copy: it is not needed any more and would
    # otherwise stay alive (and un-collectable) for the whole run.
    del rdd_collected

    rdd = rdd.map(lambda basket: [uniques[item] for item in basket])

    frequent_items_dfs = []
    cached_rdd = None
    for i in range(max_size):
        if not quiet: print(f"Checking frequent sets of {i+1} elements")

        t0 = timeit.default_timer()

        frequent_items_values = None
        gc.collect()

        previous_cached_rdd = None
        if i != 0:
            # `frequent_items` and `i` are bound as default arguments on purpose:
            # Spark evaluates (and pickles) these functions lazily, so a plain
            # closure would see the values of a *later* iteration when the lineage
            # is recomputed, which happens on every level when cache is False.
            rdd = rdd.map(lambda basket, fi=frequent_items, k=i: compute_frequents(basket, fi, k))
            rdd = rdd.filter(lambda basket, k=i: len(basket) > k)
            if cache:
                previous_cached_rdd = cached_rdd
                rdd = rdd.persist()
                cached_rdd = rdd

        # The candidate counts are consumed exactly once by reduceByKey, so they
        # are not persisted.
        n_pass = rdd.flatMap(lambda basket, k=i: ((items, 1) for items in combinations(basket, k+1)))
        n_pass = n_pass.reduceByKey(add)
        n_pass = n_pass.filter(lambda key_value: key_value[1] >= support)

        frequent_items_values = n_pass.collect()

        # The action above has read the previous level; its cached baskets
        # can now be released.
        if previous_cached_rdd is not None:
            previous_cached_rdd.unpersist()

        frequent_items = set(key_values[0] for key_values in frequent_items_values)
        frequent_items_values = sorted(frequent_items_values, key = lambda key_values: key_values[1], reverse = True)
        # translate the integer ids back to the original items
        frequent_items_values = [(*[uniques_vec[k] for k in kv[0]], kv[1]) for kv in frequent_items_values]
        fdf = pd.DataFrame.from_records(frequent_items_values, columns=[*[f'item_{s+1}' for s in range(i+1)], "frequency"])
        frequent_items_dfs.append(fdf)

        t1 = timeit.default_timer()

        if not quiet:
            print(f"Step number {i+1} completed in {round(t1-t0, 2)} seconds.")
            print(f"Number of frequent sets of {i+1} elements: {len(frequent_items_values)}\n")
            display(fdf)

    # do not leave the last level cached in Spark after returning
    if cached_rdd is not None:
        cached_rdd.unpersist()

    return frequent_items_dfs

def compute_frequents(basket, frequent_items, i):
    """
    Prune a basket down to the items that belong to at least one frequent itemset of size i.

      basket         -> the items of the basket.
      frequent_items -> container of the frequent itemsets of size i, stored as tuples.
      i              -> size of the itemsets in frequent_items.

    Returns the surviving items as a sorted list.
    """
    kept = {
        member
        for candidate in combinations(basket, i)
        if candidate in frequent_items
        for member in candidate
    }
    return sorted(kept)

In [6]:
%%time
# full dataset (sample_fraction=1): this ran in ~90 seconds on a computer with 16gb of RAM
results = apriori(
    df,
    "job_skills",
    support_threshold=0.01,
    max_size=6,
    sample_fraction=1,
    seed=42,
    cache=True,
    quiet=False,
)

                                                                                

Checking frequent sets of 1 elements




Step number 1 completed in 20.34 seconds.
Number of frequent sets of 1 elements: 186



                                                                                

Unnamed: 0,item_1,frequency
0,communication,556216
1,problem solving,316069
2,customer service,290004
3,team work,251963
4,leadership,205542
...,...,...
181,english,13403
182,rn license,13361
183,ethics,13102
184,travel nursing,13004


Checking frequent sets of 2 elements




Step number 2 completed in 16.39 seconds.
Number of frequent sets of 2 elements: 298



                                                                                

Unnamed: 0,item_1,item_2,frequency
0,communication,problem solving,257218
1,communication,team work,200249
2,communication,customer service,195320
3,communication,leadership,151171
4,team work,problem solving,142269
...,...,...,...
293,time management,microsoft office,13029
294,decision making,attention to detail,13005
295,budget management,communication,13004
296,problem solving,planning,12984


Checking frequent sets of 3 elements




Step number 3 completed in 14.27 seconds.
Number of frequent sets of 3 elements: 235



                                                                                

Unnamed: 0,item_1,item_2,item_3,frequency
0,communication,team work,problem solving,128920
1,communication,customer service,problem solving,105967
2,communication,problem solving,leadership,93025
3,communication,customer service,team work,87327
4,communication,time management,problem solving,83301
...,...,...,...,...
230,communication,troubleshooting,problem solving,13031
231,adaptability,team work,leadership,13023
232,communication,project management,analytical,12999
233,communication,patient care,leadership,12956


Checking frequent sets of 4 elements




Step number 4 completed in 12.91 seconds.
Number of frequent sets of 4 elements: 84



                                                                                

Unnamed: 0,item_1,item_2,item_3,item_4,frequency
0,communication,customer service,team work,problem solving,59328
1,communication,team work,problem solving,leadership,50491
2,communication,time management,team work,problem solving,50170
3,communication,team work,problem solving,attention to detail,46984
4,communication,customer service,problem solving,leadership,40666
...,...,...,...,...,...
79,time management,customer service,team work,leadership,13208
80,adaptability,communication,time management,team work,13196
81,communication,team work,problem solving,analytical,13157
82,communication,problem solving,critical thinking,leadership,12966


Checking frequent sets of 5 elements




Step number 5 completed in 5.61 seconds.
Number of frequent sets of 5 elements: 12



                                                                                

Unnamed: 0,item_1,item_2,item_3,item_4,item_5,frequency
0,communication,customer service,team work,problem solving,attention to detail,24399
1,communication,time management,customer service,team work,problem solving,24283
2,communication,time management,team work,problem solving,attention to detail,24177
3,communication,customer service,team work,problem solving,leadership,24096
4,communication,time management,team work,problem solving,leadership,21844
5,communication,time management,customer service,problem solving,attention to detail,18255
6,communication,time management,customer service,problem solving,leadership,17656
7,communication,customer service,team work,problem solving,sales,16568
8,communication,team work,problem solving,attention to detail,leadership,16006
9,communication,time management,customer service,team work,attention to detail,14807


Checking frequent sets of 6 elements
Step number 6 completed in 1.72 seconds.
Number of frequent sets of 6 elements: 1



                                                                                

Unnamed: 0,item_1,item_2,item_3,item_4,item_5,item_6,frequency
0,communication,time management,customer service,team work,problem solving,attention to detail,13192


CPU times: user 17.4 s, sys: 1.95 s, total: 19.3 s
Wall time: 1min 34s


In [9]:
%%time
# mining only a 1% sample of the baskets gives a much shorter running time
results = apriori(
    df,
    "job_skills",
    support_threshold=0.01,
    max_size=6,
    sample_fraction=0.01,
    seed=42,
    cache=True,
    quiet=True,
)

                                                                                

CPU times: user 404 ms, sys: 38 ms, total: 442 ms
Wall time: 10.2 s
