In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the datasets
data_df = pd.read_csv('data.csv')
filtered_job_titles_df = pd.read_csv('Filtered_Job_Titles_by_Domain.csv')

In [3]:
data_df.head()

Unnamed: 0,ID,Recommenders ID
0,0,"[218, 391]"
1,1,"[412, 869, 233, 289]"
2,2,"[582, 624, 592, 662, 469]"
3,3,"[194, 122]"
4,4,"[763, 726, 589, 977, 950, 543, 30]"


In [4]:
filtered_job_titles_df.head()

Unnamed: 0.1,Unnamed: 0,ID,First Line,Domain,Rank
0,0,0,BILINGUAL LANGUAGE ARTS SIXTH GRADE TEACHER,Teacher,2
1,1,1,MORTGAGE BANKING DEFAULT OPERATIONS SPECIALIST II,Finance,2
2,2,2,GUEST LECTURER,Teacher,2
3,3,3,ACCOUNTANT,Finance,2
4,4,4,STAFF ACCOUNTANT,Finance,2


In [5]:
# Creating the `lor_received` feature
data_df['lor_received'] = data_df['Recommenders ID'].apply(eval).apply(len)

In [6]:
# Creating the `lor_given` feature
lor_given_count = {}
for recommenders_list in data_df['Recommenders ID'].apply(eval):
    for recommender in recommenders_list:
        if recommender in lor_given_count:
            lor_given_count[recommender] += 1
        else:
            lor_given_count[recommender] = 1
data_df['lor_given'] = data_df['ID'].map(lor_given_count).fillna(0).astype(int)

In [7]:
# Merging with Domain
merged_df = pd.merge(data_df, filtered_job_titles_df[['ID', 'Domain']], on='ID', how='left')

In [8]:
# Display the final merged dataframe
merged_df.head()

Unnamed: 0,ID,Recommenders ID,lor_received,lor_given,Domain
0,0,"[218, 391]",2,1,Teacher
1,1,"[412, 869, 233, 289]",4,2,Finance
2,2,"[582, 624, 592, 662, 469]",5,2,Teacher
3,3,"[194, 122]",2,5,Finance
4,4,"[763, 726, 589, 977, 950, 543, 30]",7,1,Finance


In [9]:
domain_mode = merged_df['Domain'].mode()[0]
merged_df['Domain'].fillna(domain_mode, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['Domain'].fillna(domain_mode, inplace=True)


In [10]:
recommender_domain_df = filtered_job_titles_df[['ID', 'Domain']].rename(columns={'ID': 'Recommender_ID', 'Domain': 'Recommender_Domain'})

In [11]:
exploded_data = merged_df[['ID', 'Recommenders ID', 'Domain']].copy()
exploded_data['Recommenders ID'] = exploded_data['Recommenders ID'].apply(eval)
exploded_data = exploded_data.explode('Recommenders ID').rename(columns={'Recommenders ID': 'Recommender_ID'})

In [12]:
merged_recommender_domain = pd.merge(exploded_data, recommender_domain_df, on='Recommender_ID', how='left')

In [13]:
# Defining domain and cross domain recommendation counts
merged_recommender_domain['same_domain'] = merged_recommender_domain['Domain'] == merged_recommender_domain['Recommender_Domain']
domain_specific = merged_recommender_domain.groupby('ID')['same_domain'].sum()
cross_domain = merged_recommender_domain.groupby('ID')['same_domain'].apply(lambda x: (~x).sum())

In [14]:
# Calculating cross domain influence
recommender_given_df = pd.merge(merged_df[['ID', 'lor_given', 'Domain']], recommender_domain_df, left_on='ID', right_on='Recommender_ID', how='left')
recommender_given_df['given_in_same_domain'] = recommender_given_df['Domain'] == recommender_given_df['Recommender_Domain']
domain_influence = recommender_given_df.groupby('ID')['given_in_same_domain'].sum() / recommender_given_df.groupby('ID')['lor_given'].sum()

In [15]:
# Merging the new features 
merged_df['domain_specific'] = merged_df['ID'].map(domain_specific).fillna(0).astype(int)
merged_df['cross_domain'] = merged_df['ID'].map(cross_domain).fillna(0).astype(int)
merged_df['domain_influence'] = merged_df['ID'].map(domain_influence).fillna(0)

In [16]:
merged_df.head()

Unnamed: 0,ID,Recommenders ID,lor_received,lor_given,Domain,domain_specific,cross_domain,domain_influence
0,0,"[218, 391]",2,1,Teacher,1,1,1.0
1,1,"[412, 869, 233, 289]",4,2,Finance,2,2,0.5
2,2,"[582, 624, 592, 662, 469]",5,2,Teacher,0,5,0.5
3,3,"[194, 122]",2,5,Finance,2,0,0.2
4,4,"[763, 726, 589, 977, 950, 543, 30]",7,1,Finance,7,0,1.0


In [17]:
# Define the weights for each component of connectivity influence
weight_domain_specific = 1
weight_cross_domain = 1
weight_domain_influence = 1
weight_lor_received = 1
weight_lor_given = 1

In [18]:
# Calculate the connectivity influence as a weighted sum
merged_df['connectivity_influence'] = (
    (merged_df['domain_specific'] * weight_domain_specific) +
    (merged_df['cross_domain'] * weight_cross_domain) +
    (merged_df['domain_influence'] * weight_domain_influence) +
    (merged_df['lor_received'] * weight_lor_received) +
    (merged_df['lor_given'] * weight_lor_given)
)

In [19]:
merged_df.head()

Unnamed: 0,ID,Recommenders ID,lor_received,lor_given,Domain,domain_specific,cross_domain,domain_influence,connectivity_influence
0,0,"[218, 391]",2,1,Teacher,1,1,1.0,6.0
1,1,"[412, 869, 233, 289]",4,2,Finance,2,2,0.5,10.5
2,2,"[582, 624, 592, 662, 469]",5,2,Teacher,0,5,0.5,12.5
3,3,"[194, 122]",2,5,Finance,2,0,0.2,9.2
4,4,"[763, 726, 589, 977, 950, 543, 30]",7,1,Finance,7,0,1.0,16.0


In [20]:
# Now using decision trees (information gain) to get the updated weights
features = ['domain_specific', 'cross_domain', 'domain_influence', 'lor_received', 'lor_given']
X = merged_df[features]
y = merged_df['connectivity_influence']

In [21]:
# Precuation: Removing any nan or infinite values
X_cleaned = X.replace([np.inf, -np.inf], np.nan).dropna()
y_cleaned = y.loc[X_cleaned.index]

In [22]:
# Initializing and running the model
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_cleaned, y_cleaned)

In [23]:
feature_importances = tree_model.feature_importances_

In [24]:
for feature, importance in zip(features, feature_importances):
    print(f'{feature}: {importance}')

domain_specific: 0.001390313933299589
cross_domain: 1.801368470837867e-05
domain_influence: 0.004749758267238032
lor_received: 0.8332949096292097
lor_given: 0.1605470044855442


In [25]:
# Assigning the new weights derived from the decision trees
weight_domain_specific = feature_importances[0]
weight_cross_domain = feature_importances[1]
weight_domain_influence = feature_importances[2]
weight_lor_received = feature_importances[3]
weight_lor_given = feature_importances[4]

In [26]:
# Re-calculating the connectivity influence as a weighted sum
merged_df['connectivity_influence'] = (
    (merged_df['domain_specific'] * weight_domain_specific) +
    (merged_df['cross_domain'] * weight_cross_domain) +
    (merged_df['domain_influence'] * weight_domain_influence) +
    (merged_df['lor_received'] * weight_lor_received) +
    (merged_df['lor_given'] * weight_lor_given)
)

In [27]:
merged_df.head()

Unnamed: 0,ID,Recommenders ID,lor_received,lor_given,Domain,domain_specific,cross_domain,domain_influence,connectivity_influence
0,0,"[218, 391]",2,1,Teacher,1,1,1.0,1.833295
1,1,"[412, 869, 233, 289]",4,2,Finance,2,2,0.5,3.659465
2,2,"[582, 624, 592, 662, 469]",5,2,Teacher,0,5,0.5,4.490034
3,3,"[194, 122]",2,5,Finance,2,0,0.2,2.473055
4,4,"[763, 726, 589, 977, 950, 543, 30]",7,1,Finance,7,0,1.0,6.008093


In [28]:
infinite_values = merged_df[np.isinf(merged_df['connectivity_influence'])]
infinite_values.head()

Unnamed: 0,ID,Recommenders ID,lor_received,lor_given,Domain,domain_specific,cross_domain,domain_influence,connectivity_influence
43,43,"[675, 367, 602, 536, 229, 131]",6,0,Human Resource (Recruiting Team),5,1,inf,inf
137,140,"[882, 156, 276]",3,0,Finance,3,0,inf,inf
153,157,"[151, 867, 781]",3,0,Supporting People,0,3,inf,inf
177,181,"[52, 726, 205, 137, 802, 887]",6,0,Finance,6,0,inf,inf
185,190,"[557, 532, 666, 343, 602, 691, 93, 526]",8,0,Human Resource (Recruiting Team),6,2,inf,inf


In [29]:
merged_df['connectivity_influence'].replace([np.inf, -np.inf], np.nan, inplace=True)
merged_df.dropna(subset=['connectivity_influence'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['connectivity_influence'].replace([np.inf, -np.inf], np.nan, inplace=True)


In [30]:
min_value = merged_df['connectivity_influence'].min()
max_value = merged_df['connectivity_influence'].max()

In [31]:
merged_df['connectivity_influence_scaled'] = (
    (merged_df['connectivity_influence'] - min_value) / (max_value - min_value)
)

In [32]:
merged_df.head()

Unnamed: 0,ID,Recommenders ID,lor_received,lor_given,Domain,domain_specific,cross_domain,domain_influence,connectivity_influence,connectivity_influence_scaled
0,0,"[218, 391]",2,1,Teacher,1,1,1.0,1.833295,0.084637
1,1,"[412, 869, 233, 289]",4,2,Finance,2,2,0.5,3.659465,0.239201
2,2,"[582, 624, 592, 662, 469]",5,2,Teacher,0,5,0.5,4.490034,0.309499
3,3,"[194, 122]",2,5,Finance,2,0,0.2,2.473055,0.138785
4,4,"[763, 726, 589, 977, 950, 543, 30]",7,1,Finance,7,0,1.0,6.008093,0.437985


In [33]:
# Exporting only the ID and connectivity_influence_scaled in a csv
output_df = merged_df[['ID', 'connectivity_influence_scaled']]

# Saving the dataframe to a CSV file
output_file_path = 'connectivity_influence_scaled.csv'
output_df.to_csv(output_file_path, index=False)