In [9]:
import pandas as pd
import numpy as np

# Sample job features (numerical and categorical)
job_features = pd.DataFrame({
    'job_id': np.arange(30),
    'wage': np.random.randint(50000, 150000, size=30),
    'location_closeness': np.random.randint(1, 50, size=30),
    'company_size': np.random.randint(50, 10000, size=30),
    'job_description_similarity': np.random.uniform(0.5, 1.0, size=30),
    'subsector': np.random.choice(['tech', 'finance', 'healthcare', 'education', 'retail'], size=30)
})

# One-hot encoding for categorical features
job_features = pd.get_dummies(job_features, columns=['subsector'])

# Generate 15 unique pairs of jobs for comparisons
np.random.seed(42)
job_ids = np.arange(30)
np.random.shuffle(job_ids)
job_pairs = [(job_ids[i], job_ids[i + 1]) for i in range(0, len(job_ids), 2)][:15]

# Generating random preferences (simulating user preferences)
preferred_jobs = [np.random.choice([pair[0], pair[1]]) for pair in job_pairs]

# Constructing the comparisons DataFrame
comparisons = pd.DataFrame({
    'job_A': [pair[0] for pair in job_pairs],
    'job_B': [pair[1] for pair in job_pairs],
    'preferred': preferred_jobs
})

# Function to create feature difference between job_A and job_B
def create_feature_diff(row, job_features):
    job_A_features = job_features.loc[job_features['job_id'] == row['job_A']].drop('job_id', axis=1).values
    job_B_features = job_features.loc[job_features['job_id'] == row['job_B']].drop('job_id', axis=1).values
    return (job_A_features - job_B_features).flatten()

# Create feature difference data (X) and labels (y)
X = np.array([create_feature_diff(row, job_features) for _, row in comparisons.iterrows()])
y = np.array(comparisons['preferred'] == comparisons['job_A']).astype(int)

# Display the feature differences and labels
print("Feature Differences (X):\n", X)
print("\nLabels (y):\n", y)

Feature Differences (X):
 [[-3885 2 -4212 0.18068642386602574 1 0 0 0 -1]
 [-17327 15 -2204 0.26749735680930775 0 0 0 -1 1]
 [63771 15 -1735 -0.06376317963746192 0 0 0 0 0]
 [69774 24 6139 -0.1820369915018265 0 0 0 0 0]
 [56423 23 5108 -0.04352570591465832 0 0 0 0 0]
 [-59981 33 2290 0.019869693599867677 1 0 0 -1 0]
 [31296 32 1747 0.13295896370053562 0 0 0 0 0]
 [-55370 1 -6125 0.1938140464639907 0 -1 0 1 0]
 [32917 27 3094 0.13669807642679954 0 -1 0 0 1]
 [-65343 -6 -6575 -0.15695375244595444 0 -1 0 1 0]
 [54382 -23 1794 0.18156803352446527 0 0 1 0 -1]
 [-33154 -22 -448 0.09474513162522691 0 1 0 0 -1]
 [-18610 -35 6107 0.18526029887904816 0 1 -1 0 0]
 [71733 20 -1774 0.31373292394935537 0 -1 1 0 0]
 [-15711 -7 8712 0.24814151269066986 1 -1 0 0 0]]

Labels (y):
 [0 0 0 0 0 1 0 0 1 0 1 0 1 0 0]


In [10]:
import xgboost as xgb
##from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the XGBoost model
model = xgb.XGBClassifier()
#model.fit(X_train, y_train)
model.fit(X, y)

# Evaluate the model
accuracy = model.score(X, y)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.93


In [11]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic job data
n_jobs = 100

job_ids = np.arange(n_jobs)
wages = np.random.randint(50000, 150000, size=n_jobs)
location_closeness = np.random.randint(1, 50, size=n_jobs)  # 1 is very close, 50 is far
company_size = np.random.randint(50, 10000, size=n_jobs)  # Number of employees
job_description_similarity = np.random.uniform(0.5, 1.0, size=n_jobs)  # Similarity score between 0.5 and 1.0
subsectors = np.random.choice(['tech', 'finance', 'healthcare', 'education', 'retail'], size=n_jobs)

# Create the DataFrame
jobs = pd.DataFrame({
    'job_id': job_ids,
    'wage': wages,
    'location_closeness': location_closeness,
    'company_size': company_size,
    'job_description_similarity': job_description_similarity,
    'subsector': subsectors
})

# One-hot encode the 'subsector' feature
jobs = pd.get_dummies(jobs, columns=['subsector'])

# Display the first few rows of the dataset
print(jobs.head())

   job_id    wage  location_closeness  company_size  \
0       0   65795                   9          2048   
1       1   50860                  24          8044   
2       2  126820                   1          1545   
3       3  104886                  44          3354   
4       4   56265                   8          3813   

   job_description_similarity  subsector_education  subsector_finance  \
0                    0.612135                False              False   
1                    0.856090                False              False   
2                    0.618625                 True              False   
3                    0.662700                False              False   
4                    0.873246                False              False   

   subsector_healthcare  subsector_retail  subsector_tech  
0                 False              True           False  
1                 False             False            True  
2                 False             False         

In [13]:
# Example: Assuming job_features is a DataFrame with the features of 100 jobs


# Initialize an array to hold scores for each job
scores = np.zeros(len(jobs))

# Calculate preference score for each job
for i, job_A in jobs.iterrows():
    score = 0
    for j, job_B in jobs.iterrows():
        if i != j:
            # Create a feature difference vector
            feature_diff = job_A.drop('job_id').values - job_B.drop('job_id').values
            # Predict preference
            prob = model.predict_proba([feature_diff])[0, 1]
            score += prob
    scores[i] = score

# Get the indices of the top 20 jobs
top_20_indices = np.argsort(scores)[-20:]

# Retrieve the top 20 jobs
top_20_jobs = jobs.iloc[top_20_indices]
print(top_20_jobs)

    job_id    wage  location_closeness  company_size  \
15      15  109735                  41          4787   
90      90   62185                  39          8495   
19      19   55311                   8          7442   
1        1   50860                  24          8044   
71      71  115697                  42          9258   
55      55  130077                   4          8730   
27      27  134478                  35          4981   
48      48   93001                  26          6596   
59      59  119479                  39          6744   
64      64  128953                  36          8170   
42      42  146276                  14          9389   
36      36  116803                  47          7827   
25      25  143016                  24          9975   
63      63  127189                  29          8205   
92      92  136779                   8          5642   
79      79  141387                  30          9924   
51      51  118148                  32          