In [38]:
import pandas as pd

## Prepare

In [40]:
df = pd.read_csv('../dataset/train_preprocess.csv', encoding='latin-1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10642 entries, 0 to 10641
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   title                  10642 non-null  object 
 1   company                10642 non-null  object 
 2   rating                 10642 non-null  float64
 3   review_count           10642 non-null  int64  
 4   urgently_hiring        10642 non-null  bool   
 5   snippet                10642 non-null  object 
 6   dradis_job             10642 non-null  bool   
 7   new_job                10642 non-null  bool   
 8   sponsored              10642 non-null  bool   
 9   featured_employer      10642 non-null  bool   
 10  indeed_applyable       10642 non-null  bool   
 11  source_id              10642 non-null  int64  
 12  job_location_postal    10642 non-null  float64
 13  salary_min             10642 non-null  int64  
 14  salary_max             10642 non-null  int64  
 15  co

In [41]:
# Extract information from salary
df['average_salary'] = (df['salary_min'] + df['salary_max']) / 2

# Calculate the ratio M/m
df['salary_ratio'] = df['salary_max'] / df['salary_min']

# Classify into Low, Medium, or High based on the specified criteria
df['salary_range'] = pd.cut(df['salary_ratio'],
                            bins=[-float('inf'), 1.25, 1.5, float('inf')],
                            labels=['Low', 'Medium', 'High'],
                            right=False) # Ensure that the intervals are left-closed


# Print the count of each salary range
# print(df['salary_range'].value_counts())
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10642 entries, 0 to 10641
Data columns (total 34 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   title                  10642 non-null  object  
 1   company                10642 non-null  object  
 2   rating                 10642 non-null  float64 
 3   review_count           10642 non-null  int64   
 4   urgently_hiring        10642 non-null  bool    
 5   snippet                10642 non-null  object  
 6   dradis_job             10642 non-null  bool    
 7   new_job                10642 non-null  bool    
 8   sponsored              10642 non-null  bool    
 9   featured_employer      10642 non-null  bool    
 10  indeed_applyable       10642 non-null  bool    
 11  source_id              10642 non-null  int64   
 12  job_location_postal    10642 non-null  float64 
 13  salary_min             10642 non-null  int64   
 14  salary_max             10642 non-null 

## Run the block below to do experiments

In [107]:
# Parameters
min_support = 0.6
min_confidence = 0.6

# targets
feature_df = df.select_dtypes(include=['bool', 'category'])
grouped_df = feature_df.groupby('salary_range')
labels = ['Low', 'Medium', 'High']

In [108]:
from apriori_python import apriori


results = dict()
for label in labels:
    group = grouped_df.get_group(label).drop('salary_range', axis=1)

    # Prepare feature sets for Apriori
    group_dict = group.to_dict('records')
    feature_set_list = []
    for features in group_dict:
        feature_set = [name for name, value in features.items() if value]
        feature_set_list.append(feature_set)
    
    results[label] = apriori(feature_set_list, minSup=min_support, minConf=min_confidence)


In [110]:
import os
from pprint import pp

for label in labels:
    freq_feature_set, rules = results[label]
    
    freq_set_directory = './association/frequent_set'
    rule_directory = './association/rule'
    if not os.path.exists(freq_set_directory):
        os.makedirs(freq_set_directory)

    # output result for frequent sets
    file_path = os.path.join(freq_set_directory, label + '.txt')
    with open(file_path, 'w+') as output_file:
        for size, feature_sets in freq_feature_set.items():
            output_file.write(f'Size = {size}\n')
            for feature_set in feature_sets:
                output_file.write(f'....{tuple(feature_set)}\n')
    
    # output result for rules
    rules.sort(key=lambda x: x[2], reverse=True)
    file_path = os.path.join(rule_directory, label + '.txt')
    with open(file_path, 'w+') as output_file:
        for premise, claim, confidence in rules:
            output_file.write(f'{confidence:.3f}: {premise} => {claim}\n')
# pp(sorted(rules, key=lambda x: x[2], reverse=True))

FileNotFoundError: [Errno 2] No such file or directory: './association/rule/Low.txt'