In [30]:
import pandas as pd

## Prepare

In [31]:
# Load train_preprocess.csv from the sibling dataset directory, decoding it as Latin-1.
df = pd.read_csv(
    '../dataset/train_preprocess.csv',
    encoding='latin-1',
)

In [32]:
# Derive salary features from the min/max columns.
# Midpoint of the advertised salary band.
df['average_salary'] = (df['salary_min'] + df['salary_max']) / 2

# Ratio max/min: how wide the advertised band is relative to its lower bound.
# NOTE(review): a row with salary_min == 0 yields inf (or NaN when both are 0);
# inf lands in the 'High' bin below and NaN stays unclassified -- confirm the
# preprocessing guarantees salary_min > 0.
df['salary_ratio'] = df['salary_max'] / df['salary_min']

# Classify the ratio into three left-closed intervals:
#   Low: ratio < 1.25, Medium: 1.25 <= ratio < 1.5, High: ratio >= 1.5
df['salary_range'] = pd.cut(
    df['salary_ratio'],
    bins=[-float('inf'), 1.25, 1.5, float('inf')],
    labels=['Low', 'Medium', 'High'],
    right=False,  # intervals are [a, b), so a ratio of exactly 1.25 is 'Medium'
)

# Columns whose False value is also turned into an item ("<name>_False") when
# transactions are built for Apriori; every other column only contributes an
# item when its value is truthy.
boolean_features = {'urgently_hiring', 'dradis_job', 'new_job', 'sponsored', 'featured_employer', 'indeed_applyable', 'company_has_link', 'activity_date_na', 'location_remote', 'no_postal', 'hiring_event_job'}

# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that appends a stray "None" line to the output).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10642 entries, 0 to 10641
Data columns (total 34 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   title                  10642 non-null  object  
 1   company                10642 non-null  object  
 2   rating                 10642 non-null  float64 
 3   review_count           10642 non-null  int64   
 4   urgently_hiring        10642 non-null  bool    
 5   snippet                10642 non-null  object  
 6   dradis_job             10642 non-null  bool    
 7   new_job                10642 non-null  bool    
 8   sponsored              10642 non-null  bool    
 9   featured_employer      10642 non-null  bool    
 10  indeed_applyable       10642 non-null  bool    
 11  source_id              10642 non-null  int64   
 12  job_location_postal    10642 non-null  float64 
 13  salary_min             10642 non-null  int64   
 14  salary_max             10642 non-null 

## Run the cells below to run the experiments

In [41]:
# Parameters: thresholds handed to Apriori as minSup / minConf.
min_support = 0.1
min_confidence = 0.8

# Targets: keep only the boolean flags plus the categorical salary_range label.
feature_df = df.select_dtypes(include=['bool', 'category'])

# salary_range is categorical; state `observed` explicitly so the grouping keeps
# today's behaviour when the pandas default changes and so newer pandas versions
# do not emit a FutureWarning here.
grouped_df = feature_df.groupby('salary_range', observed=False)

# Class labels, in the same order as the labels passed to pd.cut.
labels = ['Low', 'Medium', 'High']
feature_df.head()

Unnamed: 0,urgently_hiring,dradis_job,new_job,sponsored,featured_employer,indeed_applyable,company_has_link,job_type_Full-time,job_type_Part-time,job_type_Temporary,job_type_Contract,job_type_Internship,job_type_N/A,activity_date_na,location_remote,no_postal,hires_needed_na,hires_needed_exact_na,salary_range
0,False,True,False,True,False,True,False,True,False,False,False,False,False,False,False,False,True,False,Low
1,False,False,False,True,False,True,True,True,False,False,False,False,False,True,False,True,True,True,Medium
2,False,False,False,True,False,False,True,False,False,False,False,False,True,True,True,True,True,True,Medium
3,False,True,True,True,False,True,True,True,False,False,False,False,False,True,False,True,True,False,Medium
4,False,True,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,Low


In [43]:
from apriori_python import apriori

def _row_to_items(row):
    """Turn one record (column name -> value) into a list of Apriori items."""
    items = []
    for name, value in row.items():
        if name in boolean_features or name == 'salary_range':
            # Keep the value in the item name so that e.g. "sponsored_False"
            # and "salary_range_Low" can appear in rules.
            items.append(name + '_' + str(value))
        elif value:
            # Remaining columns only contribute an item when truthy.
            items.append(name)
    return items


def apriori_on_dataframe(df, min_sup=None, min_conf=None):
    """Run Apriori on the rows of ``df`` and return its frequent sets and rules.

    Each row becomes one transaction. Columns listed in the module-level
    ``boolean_features`` set, and the ``salary_range`` column, are encoded as
    ``"<column>_<value>"``; any other column is encoded as ``"<column>"`` and
    only when its value is truthy.

    Args:
        df: DataFrame whose rows are converted to transactions.
        min_sup: Minimum support passed to ``apriori``. Defaults to the
            module-level ``min_support`` when omitted.
        min_conf: Minimum confidence passed to ``apriori``. Defaults to the
            module-level ``min_confidence`` when omitted.

    Returns:
        The ``(freq_feature_set, rules)`` pair returned by ``apriori``.
    """
    # Resolve the thresholds at call time so that re-running the parameter cell
    # takes effect without redefining this function.
    if min_sup is None:
        min_sup = min_support
    if min_conf is None:
        min_conf = min_confidence

    records = df.to_dict('records')
    # Show one sample record as a sanity check; skipped for an empty frame,
    # where indexing the first record would raise IndexError.
    if records:
        print('run with', records[0])

    feature_set_list = [_row_to_items(row) for row in records]
    freq_feature_set, rules = apriori(feature_set_list, minSup=min_sup, minConf=min_conf)
    return freq_feature_set, rules

# Maps an experiment name to the (frequent sets, rules) pair produced for it.
results = {}

# Whole dataset: once with the salary_range label removed, once with it kept as an item.
results['All'] = apriori_on_dataframe(feature_df.drop(columns='salary_range'))
results['All_with_label'] = apriori_on_dataframe(feature_df)

# One run per salary-range class, using only that class's rows (label column removed).
for label in labels:
    group = grouped_df.get_group(label).drop(columns='salary_range')
    results[label] = apriori_on_dataframe(group)

run with {'urgently_hiring': False, 'dradis_job': True, 'new_job': False, 'sponsored': True, 'featured_employer': False, 'indeed_applyable': True, 'company_has_link': False, 'job_type_Full-time': True, 'job_type_Part-time': False, 'job_type_Temporary': False, 'job_type_Contract': False, 'job_type_Internship': False, 'job_type_N/A': False, 'activity_date_na': False, 'location_remote': False, 'no_postal': False, 'hires_needed_na': True, 'hires_needed_exact_na': False}
run with {'urgently_hiring': False, 'dradis_job': True, 'new_job': False, 'sponsored': True, 'featured_employer': False, 'indeed_applyable': True, 'company_has_link': False, 'job_type_Full-time': True, 'job_type_Part-time': False, 'job_type_Temporary': False, 'job_type_Contract': False, 'job_type_Internship': False, 'job_type_N/A': False, 'activity_date_na': False, 'location_remote': False, 'no_postal': False, 'hires_needed_na': True, 'hires_needed_exact_na': False}
run with {'urgently_hiring': False, 'dradis_job': True, 'n

In [39]:
import os
from pprint import pp

# The output directories are the same for every experiment, so create them once.
# exist_ok=True replaces the separate exists() check and tolerates re-runs.
freq_set_directory = './association/frequent_set'
rule_directory = './association/rule'
os.makedirs(freq_set_directory, exist_ok=True)
os.makedirs(rule_directory, exist_ok=True)

for label, result in results.items():
    freq_feature_set, rules = result

    # Frequent sets: one "Size = k" header followed by one line per itemset.
    # Items and itemsets are written in sorted order so that re-running the
    # notebook produces identical files (set iteration order is not stable).
    file_path = os.path.join(freq_set_directory, label + '.txt')
    with open(file_path, 'w') as output_file:
        for size, feature_sets in freq_feature_set.items():
            output_file.write(f'Size = {size}\n')
            for feature_set in sorted(tuple(sorted(items)) for items in feature_sets):
                output_file.write(f'....{feature_set}\n')

    # Rules: highest confidence first. sorted() returns a new list, so the
    # rule lists stored in `results` are left untouched.
    ranked_rules = sorted(rules, key=lambda x: x[2], reverse=True)
    file_path = os.path.join(rule_directory, label + '.txt')
    with open(file_path, 'w') as output_file:
        for premise, claim, confidence in ranked_rules:
            output_file.write(f'{confidence:.3f}: {premise} => {claim}\n')
# pp(sorted(rules, key=lambda x: x[2], reverse=True))