# Entity Linking Model
In this notebook we train a Learning to Rank model on previously generated Entity Linking data.

In [1]:
%config IPCompleter.greedy=True
import pandas as pd

In [2]:
# data generated from https://github.com/Ryan-Amaral/wikisim/blob/master/wikification/ml-model.ipynb
# The file has no header row, so the column names are supplied here.
columnNames = ['id', 'trueEntity', 'popScore', 'ctx1Score',
               'ctx2Score', 'w2vScore', 'cohScore', 'mentionId']
rawDf = pd.read_csv('el-5000.txt', header=None)
rawDf.columns = columnNames
# 'id' is only the internal id of the wiki page, so it is dropped before modelling.
df = rawDf.drop(columns=['id'])
df.head()

Unnamed: 0,trueEntity,popScore,ctx1Score,ctx2Score,w2vScore,cohScore,mentionId
0,1,0.986376,0.596652,0.909091,0.171256,0.002138,0
1,0,0.010899,0.383337,0.0,0.44541,0.002924,0
2,1,0.992334,0.305474,0.363636,0.392859,0.009689,1
3,0,0.004791,0.137853,0.0,0.260997,0.004032,1
4,0,0.00115,0.164354,0.0,0.349205,0.00021,1


##### Each record is a candidate for a mention.
trueEntity: Whether the candidate (record) is the actual entity wiki page for the mention.

popScore: Score obtained from popularity.

ctx1Score: Score obtained from context1.

ctx2Score: Score obtained from context2.

w2vScore: Score obtained from word2vec.

cohScore: Score obtained from coherence.

mentionId: The mention that this record is referring to.

In [3]:
# Summary statistics for every column. Author's note: the scores are poorly
# scaled, a consequence of how the data was originally generated.
df.describe()

Unnamed: 0,trueEntity,popScore,ctx1Score,ctx2Score,w2vScore,cohScore,mentionId
count,226797.0,226797.0,226797.0,226797.0,226797.0,226797.0,226797.0
mean,0.11162,0.10923,0.095271,0.086193,0.197487,0.088473,12874.733114
std,0.314899,0.269932,0.177396,0.244223,0.118741,0.219805,7455.759382
min,0.0,1.6e-05,0.0,0.0,-0.039516,0.0,0.0
25%,0.0,0.000892,0.0,0.0,0.142257,0.000101,6342.0
50%,0.0,0.003699,0.065787,0.0,0.215558,0.00286,13103.0
75%,0.0,0.025126,0.105248,0.0,0.27648,0.036394,19357.0
max,1.0,0.999946,0.995435,0.909091,0.70492,1.0,25607.0


In [7]:
# 60-20-20 train / validate / test split using all five score features.
# shuffle=False keeps the rows in file order, so the split is deterministic.
# NOTE(review): the cut points are row-based, so the candidates of a single
# mention may straddle a split boundary -- confirm whether that matters here.
rows = len(df.index)
y = df['trueEntity'].values   # label: 1 if the candidate is the true entity
# `columns=` instead of a positional axis argument: the positional form
# (`df.drop([...], 1)`) is deprecated and was removed in pandas 2.0.
X = df.drop(columns=['trueEntity', 'mentionId']).values
i = df['mentionId'].values    # query id: groups candidates by mention
print(X.shape, y.shape, i.shape)
from sklearn.model_selection import train_test_split
# First cut: 60% train, 40% held out ("blob").
XTrain, XBlob, yTrain, yBlob, iTrain, iBlob = train_test_split(
    X, y, i, test_size=0.4, shuffle=False)
# Second cut: the held-out 40% is halved into validation and test (20% each).
XVali, XTest, yVali, yTest, iVali, iTest = train_test_split(
    XBlob, yBlob, iBlob, test_size=0.5, shuffle=False)

(226797, 5) (226797,) (226797,)


In [8]:
# Eyeball the training features, labels and mention ids (one array per line).
print(XTrain, yTrain, iTrain, sep='\n')

[[9.86376022e-01 5.96651917e-01 9.09090909e-01 1.71256333e-01
  2.13778277e-03]
 [1.08991826e-02 3.83337305e-01 0.00000000e+00 4.45409828e-01
  2.92353045e-03]
 [9.92334228e-01 3.05474371e-01 3.63636364e-01 3.92858793e-01
  9.68923206e-03]
 ...
 [1.15207373e-03 4.78825274e-01 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [1.15207373e-03 0.00000000e+00 0.00000000e+00 1.49611700e-01
  2.50062866e-05]
 [9.98668442e-01 6.76194596e-01 9.09090909e-01 2.97158625e-01
  7.57711168e-02]]
[1 0 1 ... 0 0 1]
[    0     0     1 ... 15588 15588 15589]


In [9]:
import pyltr

# Validation monitor handed to fit(): it is built from the validation split and
# the NDCG@10 metric. stop_after=250 is presumably the early-stopping patience
# -- confirm against the pyltr documentation.
ndcgAt10 = pyltr.metrics.NDCG(k=10)
monitor = pyltr.models.monitors.ValidationMonitor(
    XVali, yVali, iVali,
    metric=ndcgAt10,
    stop_after=250,
)

# LambdaMART ranker trained on the full feature set (word2vec score included).
lmartParams = dict(n_estimators=300, learning_rate=0.1, verbose=1)
lmart = pyltr.models.LambdaMART(**lmartParams)
lmart.fit(XTrain, yTrain, iTrain, monitor=monitor)

 Iter  Train score    Remaining                           Monitor Output 
    1       0.8993       43.45m      C:      0.8990 B:      0.8990 S:  0
    2       0.9126       44.49m      C:      0.9134 B:      0.9134 S:  0
    3       0.9246       42.93m      C:      0.9259 B:      0.9259 S:  0
    4       0.9281       42.62m      C:      0.9303 B:      0.9303 S:  0
    5       0.9358       41.38m      C:      0.9389 B:      0.9389 S:  0
    6       0.9366       40.58m      C:      0.9391 B:      0.9391 S:  0
    7       0.9367       40.12m      C:      0.9392 B:      0.9392 S:  0
    8       0.9383       39.94m      C:      0.9414 B:      0.9414 S:  0
    9       0.9385       39.79m      C:      0.9412 B:      0.9414 S:  1
   10       0.9396       39.50m      C:      0.9436 B:      0.9436 S:  0
   15       0.9439       38.21m      C:      0.9477 B:      0.9477 S:  0
   20       0.9476       37.00m      C:      0.9518 B:      0.9518 S:  0
   25       0.9515       36.27m      C:      0.955

<pyltr.models.lambdamart.LambdaMART at 0x7f7fc6c4a320>

In [10]:
# 60-20-20 train / validate / test split WITHOUT the word2vec score feature.
# This rebinds X, XTrain, XVali and XTest to 4-column matrices; the row
# partition is the same as for the 5-feature split because shuffle=False.
# NOTE(review): the cut points are row-based, so the candidates of a single
# mention may straddle a split boundary -- confirm whether that matters here.
rows = len(df.index)
y = df['trueEntity'].values   # label: 1 if the candidate is the true entity
# `columns=` instead of a positional axis argument: the positional form
# (`df.drop([...], 1)`) is deprecated and was removed in pandas 2.0.
X = df.drop(columns=['trueEntity', 'mentionId', 'w2vScore']).values
i = df['mentionId'].values    # query id: groups candidates by mention
print(X.shape, y.shape, i.shape)
from sklearn.model_selection import train_test_split
# First cut: 60% train, 40% held out ("blob").
XTrain, XBlob, yTrain, yBlob, iTrain, iBlob = train_test_split(
    X, y, i, test_size=0.4, shuffle=False)
# Second cut: the held-out 40% is halved into validation and test (20% each).
XVali, XTest, yVali, yTest, iVali, iTest = train_test_split(
    XBlob, yBlob, iBlob, test_size=0.5, shuffle=False)

(226797, 4) (226797,) (226797,)


In [12]:
# Same training setup as the full-feature model, but XTrain/XVali now hold the
# feature matrices without the word2vec score column.
ndcgAt10 = pyltr.metrics.NDCG(k=10)
monitor = pyltr.models.monitors.ValidationMonitor(
    XVali, yVali, iVali,
    metric=ndcgAt10,
    stop_after=250,
)

noW2vParams = dict(n_estimators=300, learning_rate=0.1, verbose=1)
lmartNoW2v = pyltr.models.LambdaMART(**noW2vParams)
lmartNoW2v.fit(XTrain, yTrain, iTrain, monitor=monitor)

 Iter  Train score    Remaining                           Monitor Output 
    1       0.8993       49.50m      C:      0.8990 B:      0.8990 S:  0
    2       0.9126       49.91m      C:      0.9134 B:      0.9134 S:  0
    3       0.9246       46.07m      C:      0.9259 B:      0.9259 S:  0
    4       0.9281       44.13m      C:      0.9303 B:      0.9303 S:  0
    5       0.9358       42.62m      C:      0.9389 B:      0.9389 S:  0
    6       0.9366       41.76m      C:      0.9391 B:      0.9391 S:  0
    7       0.9367       40.98m      C:      0.9392 B:      0.9392 S:  0
    8       0.9383       40.33m      C:      0.9414 B:      0.9414 S:  0
    9       0.9385       39.82m      C:      0.9412 B:      0.9414 S:  1
   10       0.9396       39.58m      C:      0.9436 B:      0.9436 S:  0
   15       0.9439       38.26m      C:      0.9477 B:      0.9477 S:  0
   20       0.9477       37.68m      C:      0.9518 B:      0.9518 S:  0
   25       0.9515       36.69m      C:      0.955

<pyltr.models.lambdamart.LambdaMART at 0x7f7fc6c4a470>

In [13]:
# Persist both fitted rankers to disk so the long training runs are not lost
# if the kernel crashes.
import pickle

for path, model in (('el-lmart.pkl', lmart),
                    ('el-lmart-NoW2v.pkl', lmartNoW2v)):
    with open(path, 'wb') as f:
        pickle.dump(model, f)

We suspected that the word2vec score was not a useful feature for us, so we tested whether the model ranks better without it.

In [18]:
# Compare both models on the held-out test split using NDCG@10.
#
# Each model must be scored on a feature matrix with the columns it was trained
# on. By this point the global XTest holds the 4-column (no word2vec) matrix,
# because the second split cell rebinds it, while `lmart` was fit on all five
# score columns. The test features are therefore rebuilt per model from `df`.
metric = pyltr.metrics.NDCG(k=10)


def testFeatures(dropCols):
    """Return the test-split feature matrix of `df` without `dropCols`.

    Repeats the two unshuffled cuts used for training (60/40, then the held-out
    40% halved), so the rows returned line up with the global yTest / iTest.
    """
    feats = df.drop(columns=dropCols).values
    _, blob = train_test_split(feats, test_size=0.4, shuffle=False)
    _, test = train_test_split(blob, test_size=0.5, shuffle=False)
    return test


XTestW2v = testFeatures(['trueEntity', 'mentionId'])
XTestNoW2v = testFeatures(['trueEntity', 'mentionId', 'w2vScore'])
# Guard against the rebuilt matrices drifting out of step with the labels.
assert len(XTestW2v) == len(XTestNoW2v) == len(yTest) == len(iTest)

pred = lmart.predict(XTestW2v)
print('With Word2Vec', metric.calc_mean(iTest,yTest,pred))
pred = lmartNoW2v.predict(XTestNoW2v)
print('Without Word2Vec', metric.calc_mean(iTest,yTest,pred))

With Word2Vec 0.9435523801489148
Without Word2Vec 0.960043193866277
