In [None]:
import numpy as np
import json
from ltr.data import CorpusApi

In [None]:
from os import listdir
from os.path import isfile, join


def getJudgmentsBatchFileByFile(directory='./loggedFeatures'):
    """
    Yield the path of every regular file located directly inside `directory`.

    Each path is `directory` joined with the entry name; sub-directories are
    skipped. Entries are visited in sorted name order because os.listdir
    returns them in arbitrary order, which would otherwise make the order of
    the loaded judgments differ between runs.

    :param directory: folder containing the judgment batch files
    :return: generator of file paths (strings)
    """
    for entryName in sorted(listdir(directory)):
        entryPath = join(directory, entryName)
        if isfile(entryPath):
            yield entryPath

In [None]:
import csv
# Collect the rows of every judgment batch file into one flat list
judgments = []
for batchPath in getJudgmentsBatchFileByFile():
  with open(batchPath,'r') as batchFile:
    # Fields are separated by single spaces; commas inside a field are stripped
    judgments.extend(
      [field.replace(',', '') for field in rowFields]
      for rowFields in csv.reader(batchFile, delimiter=' ')
    )

In [None]:
# Order the judgments by query id (the second column of each row).
# The sort is in place, so `judgments` itself is reordered.

judgments.sort(key=lambda row: row[1])

# Print the first four rows as a quick sanity check
for position in range(4):
  print(judgments[position])

In [None]:
# Drop the written query (third column) from every judgment.
# NOTE: the rows are modified in place, so running this cell a second time
# removes one more column from each row.

for row in judgments:
  del row[2]

# Print the first eight rows as a quick sanity check
for position in range(8):
  print(judgments[position])

In [None]:
# Implement min max normalization

def normalizeFeatures(loggedJudgments):
    """
    Min-max normalize the feature columns of the logged judgments.

    Every judgment is a sequence whose first two entries are copied unchanged
    and whose remaining entries are feature values convertible with float().

    :param loggedJudgments: list of judgments (may be empty)
    :return: tuple (minimums, maximums, normedJudgments) where
             - minimums / maximums hold the observed per-feature bounds as floats
             - normedJudgments holds new lists with the first two entries of
               each judgment followed by (value - min) / (max - min) per feature

    The bounds are derived from the data only (no assumed value range), so
    features whose values are all above 100 or all negative are handled
    correctly. A feature that is constant over all judgments (max == min) is
    mapped to 0.0 instead of raising ZeroDivisionError. The input judgments
    are not modified.
    """
    if not loggedJudgments:
        return [], [], []

    # Parse each feature once; the parsed rows feed both passes below
    featureRows = [[float(feature) for feature in judgment[2:]] for judgment in loggedJudgments]

    # Start from +/- infinity so the first observed value always becomes the bound
    featureCount = len(featureRows[0])
    minimums = [float('inf')] * featureCount
    maximums = [float('-inf')] * featureCount

    for featureRow in featureRows:
        for idx, value in enumerate(featureRow):
            if value < minimums[idx]:
                minimums[idx] = value
            if value > maximums[idx]:
                maximums[idx] = value

    normedJudgments = []
    for judgment, featureRow in zip(loggedJudgments, featureRows):
        normedJudgment = list(judgment[:2])
        for idx, value in enumerate(featureRow):
            valueRange = maximums[idx] - minimums[idx]
            # A zero range means the feature carries no information; use 0.0
            normedJudgment.append((value - minimums[idx]) / valueRange if valueRange else 0.0)
        normedJudgments.append(normedJudgment)

    return minimums, maximums, normedJudgments

In [None]:
# Rescale all judgment features with min-max normalization. The per-feature
# minimums and maximums are kept because the Solr model configuration built
# further below needs them for its normalizer parameters.
minimums, maximums, normalizedJudgments = normalizeFeatures(judgments)

In [None]:
# Save the normalized judgments to the submission archive.
# The on-disk format is decided by CorpusApi.saveListAsFile (project helper).
CorpusApi.saveListAsFile('./submission/ranksvm/data/minMaxNormalizedFeatures.csv', normalizedJudgments)

In [None]:
from itertools import groupby

def pairwiseTransformation(normalizedJudgments):
    """
    Turn graded judgments into pairwise training samples for a ranking SVM.

    Each judgment is a sequence: [grade, query id, feature_1, ..., feature_n].
    For every ordered pair of judgments that belong to the same query and have
    different grades, one sample is produced:
      - feature delta: features(judgment1) - features(judgment2)
      - predictor delta: +1 if judgment1 has the higher grade, otherwise -1

    Judgments are grouped with itertools.groupby, which only merges adjacent
    items, so the input must already be ordered/grouped by query id.

    :param normalizedJudgments: judgments grouped by query id; grades must be
                                convertible with int(), features with float()
    :return: tuple (featureDeltas, predictorDeltas) of numpy arrays
    """
    predictorDeltas = []
    featureDeltas = []

    for _, queryJudgments in groupby(normalizedJudgments, key=lambda j: j[1]):

        # Parse grade and feature vector once per judgment instead of once per
        # pair; dtype=float also accepts features that are still strings.
        gradedFeatures = [
            (int(judgment[0]), np.array(judgment[2:], dtype=float))
            for judgment in queryJudgments
        ]

        # Examine every judgment combination of this query and store the
        # pairwise difference whenever the grades differ.
        for grade1, features1 in gradedFeatures:
            for grade2, features2 in gradedFeatures:
                if grade1 == grade2:
                    continue
                predictorDeltas.append(+1 if grade1 > grade2 else -1)
                featureDeltas.append(features1 - features2)

    return np.array(featureDeltas), np.array(predictorDeltas)
        
        

In [None]:
# Build the pairwise training samples: one feature-difference vector and one
# +1/-1 label per pair of differently graded judgments of the same query.
featureDeltas, predictorDeltas = pairwiseTransformation(normalizedJudgments)

In [None]:
from sklearn.model_selection import train_test_split

# Split the pairwise samples into a training part and a held-out test part.
# test_size=0.2 reserves 20% of the samples for testing; shuffle=False turns
# off shuffling, so the split follows the existing sample order.
featureDeltas_train, featureDeltas_test, predictorDeltas_train, predictorDeltas_test = train_test_split(
    featureDeltas, predictorDeltas, test_size=0.2, shuffle=False)

In [None]:
# Save the train/test split data to the submission archive:
# feature deltas first, then the matching predictor deltas.

CorpusApi.saveListAsFile('./submission/ranksvm/data/minMaxNormalizedFeatures_featureDeltasTrain.csv', featureDeltas_train)
CorpusApi.saveListAsFile('./submission/ranksvm/data/minMaxNormalizedFeatures_featureDeltasTest.csv', featureDeltas_test)

CorpusApi.saveListAsFile('./submission/ranksvm/data/minMaxNormalizedFeatures_predictorDeltasTrain.csv', predictorDeltas_train)
CorpusApi.saveListAsFile('./submission/ranksvm/data/minMaxNormalizedFeatures_predictorDeltasTest.csv', predictorDeltas_test)

In [50]:
from sklearn import svm


# Train the svm model on the pairwise samples.
# random_state is fixed so that repeated runs produce the same weights: with
# the hinge loss the solver works on the dual problem, which uses a
# pseudo-random number generator.
model = svm.LinearSVC(max_iter=1000000000, verbose=1, C=0.001, penalty="l2", loss='hinge', random_state=42)
model.fit(featureDeltas_train, predictorDeltas_train)


[LibLinear]..*
optimization finished, #iter = 26
Objective value = -24.478290
nSV = 29192


In [51]:
# Model validation: score() reports the mean accuracy on the held-out samples.
# Each sample is a judgment pair, so this is the share of test pairs whose
# order the model predicts correctly.

model.score(featureDeltas_test, predictorDeltas_test)

0.9311990686845169

In [52]:
import math
import requests
import json

# Import the model to solr

modelName = 'ranksvm-0.001-hinge'
modelStoreUrl = 'http://localhost:8983/solr/thesis-ltr/schema/model-store'
# Upper bound in seconds for each HTTP call; without it requests waits forever
requestTimeoutSeconds = 60

# Order matters: entry i must describe feature column i of the training data
featureNames = ["title_coveredQueryTerms",
    "headings_coveredQueryTerms",
    "body_coveredQueryTerms",
    "document_coveredQueryTerms",
    "title_coveredQueryTermsRatio",
    "headings_coveredQueryTermsRatio",
    "body_coveredQueryTermsRatio",
    "document_coveredQueryTermsRatio",
    "title_tf",
    "headings_tf",
    "body_tf",
    "document_tf",
    "title_idf",
    "headings_idf",
    "body_idf",
    "document_idf",
    "title_tfidf",
    "headings_tfidf",
    "body_tfidf",
    "document_tfidf",
    "title_bm25",
    "headings_bm25",
    "body_bm25",
    "document_bm25"
]

# Weights and normalizer bounds are matched to the names purely by position,
# so a length mismatch would silently attach values to the wrong feature.
weights = model.coef_[0]
if not (len(featureNames) == len(minimums) == len(maximums) == len(weights)):
    raise ValueError(
        f'Feature count mismatch: {len(featureNames)} names, {len(minimums)} minimums, '
        f'{len(maximums)} maximums, {len(weights)} weights'
    )

linearModel = {
  "store": "thesis-ltr",
  "class": "org.apache.solr.ltr.model.LinearModel",
  "name": modelName,
  "features": [
  ],
  "params": {
      "weights": {
      }
  }
}

for idx, featureName in enumerate(featureNames):
    # Solr applies the same min-max normalization that was used for training
    config = {
        "name": featureName,
        "norm": {
            "class": "org.apache.solr.ltr.norm.MinMaxNormalizer",
            "params": {
                "min": str(minimums[idx]),
                "max": str(maximums[idx])
            }
        }
    }
    linearModel['features'].append(config)
    # Plain float keeps the weight JSON-serializable whatever the numpy dtype
    linearModel['params']['weights'][featureName] = float(weights[idx])

print(json.dumps(linearModel, indent=2))

# Upload the model after deleting the model
print('Delete')
requests.delete(f'{modelStoreUrl}/{modelName}', timeout=requestTimeoutSeconds).json()
print('Add new model')
requests.put(modelStoreUrl, json=linearModel, timeout=requestTimeoutSeconds).json()

{
  "store": "thesis-ltr",
  "class": "org.apache.solr.ltr.model.LinearModel",
  "name": "ranksvm-0.001-hinge",
  "features": [
    {
      "name": "title_coveredQueryTerms",
      "norm": {
        "class": "org.apache.solr.ltr.norm.MinMaxNormalizer",
        "params": {
          "min": "0.0",
          "max": "22.0"
        }
      }
    },
    {
      "name": "headings_coveredQueryTerms",
      "norm": {
        "class": "org.apache.solr.ltr.norm.MinMaxNormalizer",
        "params": {
          "min": "0.0",
          "max": "31.0"
        }
      }
    },
    {
      "name": "body_coveredQueryTerms",
      "norm": {
        "class": "org.apache.solr.ltr.norm.MinMaxNormalizer",
        "params": {
          "min": "0.0",
          "max": "34.0"
        }
      }
    },
    {
      "name": "document_coveredQueryTerms",
      "norm": {
        "class": "org.apache.solr.ltr.norm.MinMaxNormalizer",
        "params": {
          "min": "0.0",
          "max": "34.0"
        }
      }
  

{'responseHeader': {'status': 0, 'QTime': 549}}

In [53]:
# Store the Solr model definition as JSON in the submission archive
modelJsonPath = f'./submission/ranksvm/solrModels/{modelName}.json'
with open(modelJsonPath, "w") as modelFile:
    modelFile.write(json.dumps(linearModel, indent=4))