# Salary and new connection predictions

## Company Emails

The data for this project is a company's email network where each node corresponds to a person at the company, and each edge indicates that at least one email has been sent between two people.

The network also contains the node attributes `Department` and `ManagementSalary`.

`Department` indicates the department of the company to which the person belongs, and `ManagementSalary` indicates whether that person receives a management-position salary.

In [1]:
import ast
import pickle

import networkx as nx
import numpy as np
import pandas as pd

# CAREFUL: the stored network is an old one built with networkx 1.x, so
# networkx 1.11 must be installed to load it.
# NOTE(review): read_gpickle unpickles the file, so only load a file you trust.
network_path = 'email_prediction.txt'
G = nx.read_gpickle(network_path)

### Salary Prediction

Using network G, we can predict whether or not individuals are receiving a management position salary.

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

# Convert the network into a dataframe with one row per node.
# list(...) gives pandas a plain sequence whether G.nodes() returns a list
# (networkx 1.x) or a view object (networkx 2.x and later).
df = pd.DataFrame(index=list(G.nodes()))
df['mana'] = pd.Series(nx.get_node_attributes(G, 'ManagementSalary'))
df['dep'] = pd.Series(nx.get_node_attributes(G, 'Department'))

# Add structural features to improve the model.
# dict(...) keeps the node -> degree mapping whether G.degree() returns a plain
# dict (networkx 1.x) or a view of (node, degree) pairs (networkx 2.x and later);
# without it the degrees would not be aligned on the node index in the latter case.
df["degree"] = pd.Series(dict(G.degree()))
df["degree_cent"] = pd.Series(nx.degree_centrality(G))
df["clustering"] = pd.Series(nx.clustering(G))

# Rows whose 'mana' label is missing are the ones to predict: keep their features aside.
df_test = df[df["mana"].isna()][["degree", 'dep', "degree_cent", "clustering"]]
# Keep only the rows without any missing value for training.
df = df.dropna()

In [3]:
# Disparity test: how many labelled rows fall in each class?
managers = df[df["mana"] == 1.0]
non_managers = df[df["mana"] == 0.0]
print(len(non_managers), len(managers), len(df))

# Balance the data by undersampling the 0.0 class to twice the size of the 1.0 class
# (fixed random_state so the sample is reproducible).
df_training = pd.concat([managers, non_managers.sample(n=2 * len(managers), random_state=3)])

634 119 753


In [4]:
# Build the feature matrix and target from the balanced training set, then
# split off a hold-out part (fixed random_state for reproducibility).
X = df_training[["degree", 'dep', "degree_cent", "clustering"]]
y = df_training["mana"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### Decision tree ### (disabled experiment, kept for reference)
#dt_clf = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
# looking for best params
#grid_values = {"max_depth":[1, 2, 3, 4, 5, None]}
#grid_rdf = GridSearchCV(dt_clf, param_grid = grid_values, scoring = 'roc_auc').fit(X_train, y_train)
#print("Best param :",grid_rdf.best_params_)
# Evaluation
#scores = cross_val_score(dt_clf, X, y, scoring='roc_auc')
#print('Mean ROC AUC dt: %.3f' % mean(scores))

### Logistic Regression ### (the model actually used for the predictions)
lr_clf = LogisticRegression(C=5)
lr_clf.fit(X_train, y_train)
# Grid search over the regularisation parameter C, scored by ROC AUC.
# The result is only printed; lr_clf itself keeps C=5.
grid_values = {"C": [1, 2, 3, 5, 8, 10, 20]}
grid_lr = GridSearchCV(lr_clf, param_grid=grid_values, scoring='roc_auc')
grid_lr.fit(X_train, y_train)
print("Best param :", grid_lr.best_params_)
# Evaluation: cross-validated ROC AUC over the whole balanced set
scores = cross_val_score(lr_clf, X, y, scoring='roc_auc')
print('Mean ROC AUC lr: %.3f' % mean(scores))

### Random Decision tree ### (disabled experiment, kept for reference)
#rdfclf = RandomForestClassifier(n_estimators=500, random_state=0).fit(X_train, y_train)
# looking for best params
#grid_values = {"n_estimators":[50, 100, 200, 500]}
#grid_rdf = GridSearchCV(rdfclf, param_grid = grid_values, scoring = 'roc_auc').fit(X_train, y_train)
#print("Best param :",grid_rdf.best_params_)
# Evaluation (scores the random forest itself, not lr_clf)
#scores = cross_val_score(rdfclf, X, y, scoring='roc_auc')
#print('Mean ROC AUC rdt: %.3f' % mean(scores))

### MLP ### (disabled experiment, kept for reference)
# NOTE(review): X_train_scaled is not defined in the visible cells; a scaling
# step would have to be added before this experiment can be re-enabled.
#nnclf = MLPClassifier(hidden_layer_sizes = [10, 10], solver="lbfgs", random_state=0).fit(X_train_scaled, y_train)
# looking for best params
#grid_values = {"hidden_layer_sizes":[[10], [10,5], [10,10]], "solver":["lbfgs", "sgd", "adam"]}
#grid_nn = GridSearchCV(nnclf, param_grid = grid_values, scoring = 'roc_auc').fit(X_train_scaled, y_train)
#print("Best param :",grid_nn.best_params_)
# Evaluation
#scores = cross_val_score(nnclf, X, y, scoring='roc_auc')
#print('Mean ROC AUC nn: %.3f' % mean(scores))

Best param : {'C': 5}
Mean ROC AUC lr: 0.885


In [5]:
# Predict, for every unlabelled employee, the probability of a management salary.
# Run this cell before the link-prediction cells: they rebind `lr_clf` and `df_test`.
predict_mana_lr = lr_clf.predict_proba(df_test)

# Column 1 holds the probability of the positive class; index it by node id.
mana_proba = pd.Series(predict_mana_lr[:, 1], index=df_test.index)
print(mana_proba)

1       0.290414
2       0.847866
5       0.994735
8       0.270844
14      0.619467
          ...   
992     0.011725
994     0.012051
996     0.011399
1000    0.096163
1001    0.257491
Length: 252, dtype: float64


### New Connections Prediction

This section predicts future connections between employees of the network. The future-connections data is loaded below into the variable `future_connections`. The index is a tuple identifying a pair of nodes that are not currently connected, and the `Future Connection` column indicates whether an edge between those two nodes will exist in the future, where a value of 1.0 indicates a future connection.

In [6]:
# Parse each index entry such as "(6, 840)" into a tuple of node ids.
# ast.literal_eval only accepts Python literals, so unlike eval it cannot
# execute arbitrary code embedded in the CSV file.
future_connections = pd.read_csv('Future_Connections.csv', index_col=0, converters={0: ast.literal_eval})
future_connections.head(10)

Unnamed: 0,Future Connection
"(6, 840)",0.0
"(4, 197)",0.0
"(620, 979)",0.0
"(519, 872)",0.0
"(382, 423)",0.0
"(97, 226)",1.0
"(349, 905)",0.0
"(429, 860)",0.0
"(309, 989)",0.0
"(468, 880)",0.0


In [7]:
# Prepare the link-prediction features. Each networkx call below is materialised
# into a list of (u, v, score) triples so it can be indexed by node pair later.
df = future_connections.copy()
pref_atta = list(nx.preferential_attachment(G))        # preferential attachment
resource_all = list(nx.resource_allocation_index(G))   # resource allocation index
jacc = list(nx.jaccard_coefficient(G))                 # Jaccard coefficient

In [8]:
# Merging dataframes
def _pair_scores(scored_pairs, column):
    """Build a one-column frame indexed by (u, v) from (u, v, score) triples."""
    return pd.DataFrame(
        data=[triple[-1] for triple in scored_pairs],
        index=[(triple[0], triple[1]) for triple in scored_pairs],
        columns=[column],
    )


# Start from the labels and join each score on the node-pair index. Every new
# score frame is the left operand, so the final column order is
# pref_atta, resource_all, jaccard, Future Connection.
df = future_connections.copy()
for scored_pairs, column in (
    (jacc, "jaccard"),
    (resource_all, "resource_all"),
    (pref_atta, "pref_atta"),
):
    df = pd.merge(_pair_scores(scored_pairs, column), df, left_index=True, right_index=True)

# Pairs with a missing label are the ones to predict: keep their features aside.
df_test = df[df["Future Connection"].isna()][["pref_atta", 'resource_all', "jaccard"]]

# Keep only the rows without any missing value for training.
df = df.dropna()

In [9]:
# Disparity test
#print(len(df[df["Future Connection"]==0.0]), len(df[df["Future Connection"]==1.0]), len(df))

# Balance the data: keep every pair labelled 1.0 and sample twice as many pairs
# labelled 0.0 (fixed random_state so the sample is reproducible).
connected = df[df["Future Connection"] == 1.0]
not_connected = df[df["Future Connection"] == 0.0]
df_training = pd.concat([connected, not_connected.sample(n=len(connected) * 2, random_state=3)])

X = df_training[["pref_atta", 'resource_all', "jaccard"]]
y = df_training["Future Connection"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### Logistic Regression ###
lr_clf = LogisticRegression(C=1)
lr_clf.fit(X_train, y_train)
# looking for best params (disabled)
#grid_values = {"C":[1, 2, 3, 5, 8, 10, 20]}
#grid_lr = GridSearchCV(lr_clf, param_grid = grid_values, scoring = 'roc_auc').fit(X_train, y_train)
#print("Best param :",grid_lr.best_params_)
# Evaluation (disabled)
#scores = cross_val_score(lr_clf, X, y, scoring='roc_auc')
#print('Mean ROC AUC lr: %.3f' % mean(scores))

In [10]:
# Predict, for every unlabelled node pair, the probability of a future connection.
predict_new_links = lr_clf.predict_proba(df_test)

# Column 1 holds the probability of the positive class; index it by node pair.
link_proba = pd.Series(predict_new_links[:, 1], index=df_test.index)
print(link_proba)

(0, 9)          0.258917
(0, 19)         0.467299
(0, 20)         0.723175
(0, 35)         0.128700
(0, 38)         0.088970
                  ...   
(998, 999)      0.063564
(1000, 1002)    0.063616
(1000, 1003)    0.063616
(1000, 1004)    0.063616
(1001, 1002)    0.063658
Length: 122112, dtype: float64
