In [15]:
import numpy as np
import json
from ltr.data import CorpusApi

In [17]:
from os import listdir
from os.path import isfile, join
def getJudgmentsBatchFileByFile(directory='./loggedFeatures'):
    """
    Yield the path of every regular file located directly inside ``directory``.

    Sub-directories are skipped. Entries are visited in sorted name order:
    ``os.listdir`` returns names in arbitrary order, and the order in which the
    batch files are read determines the order of the rows built from them, so
    sorting keeps repeated runs reproducible.

    Args:
        directory: Folder to scan. Defaults to ``'./loggedFeatures'``.

    Yields:
        str: ``directory`` joined with the file name, one path per file.
    """
    for name in sorted(listdir(directory)):
        path = join(directory, name)
        if isfile(path):
            yield path

In [18]:
import csv
# Read every batch file into one flat list of rows (each row is a list of strings).
# The files are parsed as space-delimited text and any comma inside a field is removed.
judgments = []
for file in getJudgmentsBatchFileByFile():
  # newline='' is what the csv module asks for so it can handle line endings itself.
  with open(file, 'r', newline='') as f:
    for row in csv.reader(f, delimiter=' '):
      if not row:
        # csv.reader yields [] for a blank line; such a row has no qid column
        # and would break the later sort on row[1].
        continue
      judgments.append([element.replace(',', '') for element in row])

In [19]:
# Order the rows in place by their second column (index 1). The values are
# strings, so this is a string comparison; rows sharing the same value end up adjacent.
judgments.sort(key=lambda row: row[1])

# Preview the first four rows.
for position in range(4):
  print(judgments[position])

['0', '10000', 'msmarco_doc_05_72507775', '1.0', '2.0', '8.0', '8.0', '0.083333336', '0.16666667', '0.6666667', '0.6666667', '1.0', '2.4142137', '36.53677', '36.764153', '90.677284', '72.23833', '41.410515', '41.410515', '7.282527', '10.645987', '84.99544', '85.76555', '1.655931', '1.747792', '5.9285426', '6.021519']
['1', '10000', 'msmarco_doc_10_1691063043', '1.0', '8.0', '9.0', '9.0', '0.083333336', '0.6666667', '0.75', '0.75', '1.0', '11.69213', '55.452248', '56.990223', '90.677284', '72.23833', '41.410515', '41.410515', '7.7639008', '64.12446', '153.44191', '158.3429', '3.4576335', '13.961745', '14.04283', '14.179642']
['1', '1000005', 'msmarco_doc_19_673141443', '2.0', '2.0', '4.0', '4.0', '0.5', '0.5', '1.0', '1.0', '2.0', '2.828427', '18.934044', '19.41233', '27.758165', '22.795256', '13.165058', '13.165058', '17.45566', '22.153984', '77.66603', '79.93644', '9.476915', '9.96361', '7.746788', '7.7889223']
['0', '1000005', 'msmarco_doc_19_673231526', '1.0', '1.0', '2.0', '2.0', '

In [20]:
def _looksNumeric(text):
  """Return True when ``text`` can be parsed by ``float``."""
  try:
    float(text)
  except ValueError:
    return False
  return True


# Drop the column at index 2 (the document id in the rows previewed above) so that
# everything from index 2 onwards is a numeric feature.
# The guard makes this cell safe to re-run: an unconditional `del j[2]` would remove
# a real feature column on every additional execution.
# NOTE(review): assumes document ids are never purely numeric strings - confirm for other corpora.
for j in judgments:
  if not _looksNumeric(j[2]):
    del j[2]

# Preview the first eight rows.
for position in range(8):
  print(judgments[position])

['0', '10000', '1.0', '2.0', '8.0', '8.0', '0.083333336', '0.16666667', '0.6666667', '0.6666667', '1.0', '2.4142137', '36.53677', '36.764153', '90.677284', '72.23833', '41.410515', '41.410515', '7.282527', '10.645987', '84.99544', '85.76555', '1.655931', '1.747792', '5.9285426', '6.021519']
['1', '10000', '1.0', '8.0', '9.0', '9.0', '0.083333336', '0.6666667', '0.75', '0.75', '1.0', '11.69213', '55.452248', '56.990223', '90.677284', '72.23833', '41.410515', '41.410515', '7.7639008', '64.12446', '153.44191', '158.3429', '3.4576335', '13.961745', '14.04283', '14.179642']
['1', '1000005', '2.0', '2.0', '4.0', '4.0', '0.5', '0.5', '1.0', '1.0', '2.0', '2.828427', '18.934044', '19.41233', '27.758165', '22.795256', '13.165058', '13.165058', '17.45566', '22.153984', '77.66603', '79.93644', '9.476915', '9.96361', '7.746788', '7.7889223']
['0', '1000005', '1.0', '1.0', '2.0', '2.0', '0.25', '0.25', '0.5', '0.5', '1.0', '1.4142135', '8.70179', '8.973479', '27.758165', '22.795256', '13.165058', '

In [21]:
def normalizeFeatures(loggedJudgments):
    """
    Min-max scale every feature column of the logged judgments.

    Each judgment is a sequence whose first two entries are kept untouched and
    whose remaining entries (index 2 onwards) are feature values convertible
    with ``float``.

    Args:
        loggedJudgments: Sequence of judgment rows as described above.

    Returns:
        tuple: ``(minimums, maximums, normedJudgments)`` where ``minimums`` and
        ``maximums`` hold the observed per-feature bounds as floats, and
        ``normedJudgments`` holds one new list per input row: the first two
        original entries followed by the scaled features
        ``(value - min) / (max - min)``. A feature whose minimum equals its
        maximum is scaled to ``0.0``. Empty input returns three empty lists.
    """
    if not loggedJudgments:
        return [], [], []

    # Parse every feature once instead of calling float() repeatedly.
    featureRows = [[float(feature) for feature in judgment[2:]]
                   for judgment in loggedJudgments]

    # Seed the bounds with +/- infinity so they are always taken from the data.
    # Fixed seeds (100 for the minimum, 0 for the maximum) are wrong for features
    # that are all above 100 or all negative.
    featureCount = len(featureRows[0])
    minimums = [float('inf')] * featureCount
    maximums = [float('-inf')] * featureCount
    for features in featureRows:
        for idx, value in enumerate(features):
            if value < minimums[idx]:
                minimums[idx] = value
            if value > maximums[idx]:
                maximums[idx] = value

    normedJudgments = []
    for judgment, features in zip(loggedJudgments, featureRows):
        normedFeatures = []
        for idx, value in enumerate(features):
            valueRange = maximums[idx] - minimums[idx]
            # A constant feature has zero range; emit 0.0 rather than dividing by zero.
            normedFeatures.append((value - minimums[idx]) / valueRange if valueRange else 0.0)
        normedJudgments.append(list(judgment[:2]) + normedFeatures)

    return minimums, maximums, normedJudgments

In [22]:
# Scale the feature columns and keep the per-feature minimums/maximums, which are
# reused further down when the Solr MinMaxNormalizer entries are built.
minimums, maximums, normalizedJudgments = normalizeFeatures(judgments)

In [23]:
# Persist the normalized rows through the project's CorpusApi helper
# (the on-disk format is whatever that helper produces).
CorpusApi.saveListAsFile('./submission/ranksvm/data/minMaxNormalizedFeatures.csv', normalizedJudgments)

File written: ./submission/ranksvm/data/minMaxNormalizedFeatures.csv


In [24]:
from itertools import groupby

def pairwiseTransformation(normalizedJudgments):
    """
    Turn graded judgments into pairwise training examples.

    Each judgment row is ``[grade, qid, feature_1, feature_2, ...]``. For every
    ordered pair of rows that share a qid and have different grades, one
    example is produced: the feature difference ``row1 - row2`` labelled ``+1``
    when row1 has the higher grade and ``-1`` when row2 does. Both orderings of
    a pair are produced; pairs with equal grades produce nothing.

    Rows are grouped by qid whether or not they are contiguous in the input
    (``itertools.groupby`` would only pair rows within consecutive runs of the
    same qid). Queries are processed in order of first appearance, so input
    already sorted by qid gives the same ordering as consecutive grouping.

    Args:
        normalizedJudgments: Sequence of judgment rows as described above.
            The grade must be convertible with ``int`` and the features with
            ``float``.

    Returns:
        tuple: ``(featureDeltas, predictorDeltas)`` as numpy arrays. Both are
        empty arrays of shape ``(0,)`` when no pair differs in grade.
    """
    # Bucket rows per query; dicts keep insertion order, so queries stay in
    # first-appearance order.
    judgmentsByQuery = {}
    for judgment in normalizedJudgments:
        judgmentsByQuery.setdefault(judgment[1], []).append(judgment)

    predictorDeltas = []
    featureDeltas = []

    for queryJudgments in judgmentsByQuery.values():
        # Convert once per row rather than once per pair.
        grades = [int(judgment[0]) for judgment in queryJudgments]
        features = [np.array(judgment[2:], dtype=float) for judgment in queryJudgments]

        for idx1, grade1 in enumerate(grades):
            for idx2, grade2 in enumerate(grades):
                if grade1 == grade2:
                    continue
                predictorDeltas.append(+1 if grade1 > grade2 else -1)
                featureDeltas.append(features[idx1] - features[idx2])

    # For training purposes, we return these as numpy arrays
    return np.array(featureDeltas), np.array(predictorDeltas)
        
        

In [25]:
# Build the pairwise training set: feature differences and their +1 / -1 labels.
featureDeltas, predictorDeltas = pairwiseTransformation(normalizedJudgments)

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the pairs for testing. shuffle=False disables shuffling, so the
# split follows the existing order of the pair arrays
# (presumably the test part is the trailing 20% - see the scikit-learn docs).
featureDeltas_train, featureDeltas_test, predictorDeltas_train, predictorDeltas_test = train_test_split(
    featureDeltas, predictorDeltas, test_size=0.2, shuffle=False)

In [27]:
# Persist the four split arrays, feature deltas first and labels second,
# in the same order as before.
for outputPath, deltas in (
    ('./submission/ranksvm/data/minMaxNormalizedFeatures_featureDeltasTrain.csv', featureDeltas_train),
    ('./submission/ranksvm/data/minMaxNormalizedFeatures_featureDeltasTest.csv', featureDeltas_test),
    ('./submission/ranksvm/data/minMaxNormalizedFeatures_predictorDeltasTrain.csv', predictorDeltas_train),
    ('./submission/ranksvm/data/minMaxNormalizedFeatures_predictorDeltasTest.csv', predictorDeltas_test),
):
    CorpusApi.saveListAsFile(outputPath, deltas)

File written: ./submission/ranksvm/data/minMaxNormalizedFeatures_featureDeltasTrain.csv
File written: ./submission/ranksvm/data/minMaxNormalizedFeatures_featureDeltasTest.csv
File written: ./submission/ranksvm/data/minMaxNormalizedFeatures_predictorDeltasTrain.csv
File written: ./submission/ranksvm/data/minMaxNormalizedFeatures_predictorDeltasTest.csv


In [62]:
from sklearn import svm
# Linear SVM trained on the pairwise deltas (RankSVM-style).
# random_state is pinned so that a re-run gives the same weights: per the
# scikit-learn docs the seed drives the data shuffling of the dual solver.
model = svm.LinearSVC(
    max_iter=1000000000,
    verbose=1,
    C=100,
    penalty="l2",
    loss='squared_hinge',
    random_state=42,
)
model.fit(featureDeltas_train, predictorDeltas_train)


[LibLinear]..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...*.*..*.*.**

# Validation should be here

In [64]:
# Score the fitted model on the held-out pairs (for a scikit-learn classifier
# `score` is the mean accuracy, i.e. here the share of correctly ordered pairs).
model.score(featureDeltas_test, predictorDeltas_test)

0.9522700814901047

# Solr model implementation

In [65]:
import math
import requests
import json

modelName = 'ranksvm-100-squared_hinge'

# Names of the Solr features, listed in the order in which the values are paired
# with `minimums`, `maximums` and the model weights below.
# NOTE(review): assumes this order matches the feature column order of the logged judgments - confirm.
featureNames = ["title_coveredQueryTerms",
    "headings_coveredQueryTerms",
    "body_coveredQueryTerms",
    "document_coveredQueryTerms",
    "title_coveredQueryTermsRatio",
    "headings_coveredQueryTermsRatio",
    "body_coveredQueryTermsRatio",
    "document_coveredQueryTermsRatio",
    "title_tf",
    "headings_tf",
    "body_tf",
    "document_tf",
    "title_idf",
    "headings_idf",
    "body_idf",
    "document_idf",
    "title_tfidf",
    "headings_tfidf",
    "body_tfidf",
    "document_tfidf",
    "title_bm25",
    "headings_bm25",
    "body_bm25",
    "document_bm25"
]

# First row of coef_; presumably the single weight vector of the binary classifier.
modelWeights = model.coef_[0]

# Fail loudly on a length mismatch instead of silently attaching a weight or a
# min/max bound to the wrong feature name (or dropping trailing features).
assert len(featureNames) == len(minimums) == len(maximums) == len(modelWeights), (
    f'feature/min/max/weight counts differ: {len(featureNames)}, '
    f'{len(minimums)}, {len(maximums)}, {len(modelWeights)}'
)

# Solr LTR linear model: one min-max normalized feature entry and one weight per feature.
linearModel = {
  "store": "thesis-ltr",
  "class": "org.apache.solr.ltr.model.LinearModel",
  "name": modelName,
  "features": [
      {
          "name": featureName,
          "norm": {
              "class": "org.apache.solr.ltr.norm.MinMaxNormalizer",
              "params": {
                  "min": str(minimum),
                  "max": str(maximum)
              }
          }
      }
      for featureName, minimum, maximum in zip(featureNames, minimums, maximums)
  ],
  "params": {
      # float() turns numpy scalars into plain Python floats for JSON serialization.
      "weights": {
          featureName: float(weight)
          for featureName, weight in zip(featureNames, modelWeights)
      }
  }
}

print(json.dumps(linearModel, indent=2))

# Upload the model after deleting any model already stored under the same name.
# A timeout is passed because requests waits indefinitely by default, which would
# hang the notebook if Solr is unreachable or unresponsive.
solrModelStoreUrl = 'http://localhost:8983/solr/thesis-ltr/schema/model-store'
solrTimeoutSeconds = 60

print('Delete')
requests.delete(f'{solrModelStoreUrl}/{modelName}', timeout=solrTimeoutSeconds).json()
print('Add new model')
requests.put(solrModelStoreUrl, json=linearModel, timeout=solrTimeoutSeconds).json()

{
  "store": "thesis-ltr",
  "class": "org.apache.solr.ltr.model.LinearModel",
  "name": "ranksvm-100-squared_hinge",
  "features": [
    {
      "name": "title_coveredQueryTerms",
      "norm": {
        "class": "org.apache.solr.ltr.norm.MinMaxNormalizer",
        "params": {
          "min": "0.0",
          "max": "22.0"
        }
      }
    },
    {
      "name": "headings_coveredQueryTerms",
      "norm": {
        "class": "org.apache.solr.ltr.norm.MinMaxNormalizer",
        "params": {
          "min": "0.0",
          "max": "31.0"
        }
      }
    },
    {
      "name": "body_coveredQueryTerms",
      "norm": {
        "class": "org.apache.solr.ltr.norm.MinMaxNormalizer",
        "params": {
          "min": "0.0",
          "max": "34.0"
        }
      }
    },
    {
      "name": "document_coveredQueryTerms",
      "norm": {
        "class": "org.apache.solr.ltr.norm.MinMaxNormalizer",
        "params": {
          "min": "0.0",
          "max": "34.0"
        }
    

{'responseHeader': {'status': 0, 'QTime': 621}}

In [66]:
# Store the model definition next to the other Solr models, in a file named after the model.
serializedModel = json.dumps(linearModel, indent=4)
with open(f'./submission/ranksvm/solrModels/{modelName}.json', "w") as modelFile:
    modelFile.write(serializedModel)