### Importing Libraries

In [1]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import xgboost as xgb
import torch.nn as nn
import torch.optim as optim
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import ndcg_score
from sklearn.metrics import average_precision_score

### Loading the Data

In [2]:
def read_test_queries(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None, names=['gid', 'query'])
    return df

def read_candidate_passages(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None, names=['gid', 'pid', 'query', 'passage'])
    return df

def read_train_data(file_path):
    df = pd.read_csv(file_path, sep='\t')
    df.columns = ['gid', 'pid', 'query', 'passage', 'relevance']
    df.relevance = df.relevance.astype('int')
    ones = df[df.relevance == 1]
    zeros = df[df.relevance == 0].head(ones.shape[0])
    df = pd.concat([zeros,ones],axis=0)
    df = df.sample(frac = 1)
    df.reset_index(drop=True,inplace=True)
    return df

def read_validation_data(file_path):
    df = pd.read_csv(file_path, sep='\t')
    df.columns = ['gid', 'pid', 'query', 'passage', 'relevance']
    df.relevance = df.relevance.astype('int')
    ones = df[df.relevance == 1]
    zeros = df[df.relevance == 0].head(ones.shape[0])
    df = pd.concat([zeros,ones],axis=0)
    df = df.sample(frac = 1)
    df.reset_index(drop=True,inplace=True)
    return df

In [3]:
test_queries = read_test_queries('test-queries.tsv')
candidate_passages = read_candidate_passages('candidate_passages_top1000.tsv')
train_data = read_train_data('train_data.tsv')
val_data = read_validation_data('validation_data.tsv')

In [4]:
test_queries

Unnamed: 0,gid,query
0,1108939,what slows down the flow of blood
1,1112389,"what is the county for grand rapids, mn"
2,792752,what is ruclip
3,1119729,what do you do when you have a nosebleed from ...
4,1105095,where is sugar lake lodge located
...,...,...
195,146187,difference between a mcdouble and a double che...
196,634428,what does chs stand for?
197,1121986,what are the effects of having low blood sugar
198,321441,how much is a us postal stamp cost


In [5]:
candidate_passages

Unnamed: 0,gid,pid,query,passage
0,494835,7130104,"sensibilities, definition",This is the definition of RNA along with examp...
1,1128373,7130104,iur definition,This is the definition of RNA along with examp...
2,131843,7130104,definition of a sigmet,This is the definition of RNA along with examp...
3,20455,7130335,ar glasses definition,Best Answer: The AR designation comes from the...
4,719381,7130335,what is ar balance,Best Answer: The AR designation comes from the...
...,...,...,...,...
189872,1056204,79980,who was the first steam boat operator,Other operators with special formats accept mo...
189873,1132213,7998257,how long to hold bow in yoga,You may be surprised that to learn that yoga t...
189874,324211,7998651,how much money a united airline get as a capta...,Find cheap airline tickets & deals on flights ...
189875,1116341,7998709,closed ended mortgage definition,"What is a wrap-around mortgage, and who is it ..."


In [6]:
train_data

Unnamed: 0,gid,pid,query,passage,relevance
0,641156,1061480,what does it mean when someone sign a document...,The sender receives an email with the signed d...,0
1,1028796,1061356,what is vulnerable animals,"The lion is a vulnerable species, having seen ...",0
2,1013424,7242112,what values do zoos serve,Zoos serve several puposes depending on who yo...,1
3,996042,7945160,where is the house of myrtlewood,"House of Myrtlewood Factory Tour - Coos Bay, O...",1
4,1088715,1008742,weather for tennessee in april,"Weekly Weather Report for Cairo, Egypt. Lookin...",0
...,...,...,...,...,...
9589,544277,1058939,weather in france in july,Averages for Barcelona in July. Weather lowdow...,0
9590,70340,1068200,can inheritance monies be taken by the trustee...,AFTER YOU HAVE RECEIVED DISCHARGE IN CHAPTER 1...,0
9591,1037689,1872166,what is the legal definition of deposit,Definition of deposit. 1 1 : the state of bei...,1
9592,194724,1073551,gesundheit meaning,The name Reina is a Yiddish baby name. In Yidd...,0


In [7]:
val_data

Unnamed: 0,gid,pid,query,passage,relevance
0,1045567,1023381,what is super duplex stainless steel,"However, you will need to use a stainless stee...",0
1,996825,7271167,where is vanderbilt located?,Vanderbilt is a census-designated place (CDP) ...,1
2,117036,1058287,define a multichannel radio,Pulsars beam intense radio beams of radio wave...,0
3,458774,4589509,most popular news networks,The poll found 29 percent of people say they t...,1
4,573899,4792479,what are the steelers worth,"Pittsburgh Steelers Net Worth is 1,100 $Millio...",1
...,...,...,...,...,...
2411,1054969,1029023,what is freddie mac first look initiative,A jumbo loan provides financing for loan amoun...,0
2412,181222,7330660,equi meaning,equi-Meaning equal. Related terms . uguale; eq...,1
2413,226132,100318,how does environment affect health,"Climate can affect day-to-day activities, heal...",0
2414,849561,7737351,what is the symptoms of pleurisy,The list of signs and symptoms mentioned in va...,1


### Data Preprocessing

**Loading GloVe embeddings**

In [8]:
embeddings_dict = {}
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = vector

**Defining function to generate query/passages embeddings**

In [9]:
def generate_embeddings(data, embeddings_dict):
    """
    Generates word embeddings for given text data using pre-trained embeddings dictionary
    
    Args:
    data (list): A list of texts to generate embeddings for
    
    Returns:
    np.ndarray: A 2D numpy array of shape (len(data), 50) containing word embeddings for each text in data.
    """
    embeddings = []
    for text in data:
        words = text.split()
        text_embeddings = []
        for word in words:
            if word in embeddings_dict:
                text_embeddings.append(embeddings_dict[word])
        if len(text_embeddings) == 0:
            text_embeddings.append(np.zeros(50))
        embeddings.append(np.mean(text_embeddings, axis=0))
    return np.asarray(embeddings)

**Combining query and passage features**

In [10]:
train = train_data['query'] + ' ' + train_data['passage']
val = val_data['query'] + ' ' + val_data['passage']

train_features = generate_embeddings(train, embeddings_dict)
val_features = generate_embeddings(val, embeddings_dict)

**Defining the target variable as the relevance score**

In [11]:
train_relevance = train_data['relevance']
val_relevance = val_data['relevance']

**Normalizing the input features**

In [12]:
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
val_features = scaler.transform(val_features)

### Task 1: Evaluating Retrieval Quality

In [13]:
def average_precision_score(y_true, y_pred):
    """
    Calculates the average precision score for a binary classification problem.

    Parameters:
    -----------
    y_true : array-like, shape (n_samples,)
        True binary labels.

    y_pred : array-like, shape (n_samples,)
        Estimated binary labels.

    Returns:
    --------
    average_precision : float
        Average precision score.
    """
    n_samples = len(y_true)
    sorted_indices = np.argsort(y_pred)[::-1]
    y_true_sorted = y_true[sorted_indices]
    tp = np.cumsum(y_true_sorted)
    precision = tp / np.arange(1, n_samples+1)
    average_precision = np.sum(precision * y_true_sorted) / np.sum(y_true_sorted)

    return average_precision


def ndcg_score(y_true, y_pred, k=None):
    """
    Calculates the normalized discounted cumulative gain score for a ranking problem.

    Parameters:
    -----------
    y_true : array-like, shape (n_samples,)
        True relevance scores.

    y_pred : array-like, shape (n_samples,)
        Estimated relevance scores.

    k : int or None, optional (default=None)
        The maximum number of samples to consider.

    Returns:
    --------
    ndcg : float
        Normalized discounted cumulative gain score.
    """
    if k is None:
        k = len(y_true)

    sorted_indices = np.argsort(y_pred)[::-1]
    y_true_sorted = y_true[sorted_indices]
    dcg = np.sum(y_true_sorted[:k] / np.log2(np.arange(2, k+2)))

    ideal_indices = np.argsort(y_true)[::-1]
    ideal_sorted = y_true[ideal_indices]
    idcg = np.sum(ideal_sorted[:k] / np.log2(np.arange(2, k+2)))

    ndcg = dcg / idcg

    return ndcg

**Computing the performance of BM25 as the retrieval model on validation data**

In [14]:
tokenized_corpus = [doc.split(" ") for doc in val_data['passage']]
tokenized_queries = [doc.split(" ") for doc in val_data['query']]

bm25 = BM25Okapi(tokenized_corpus)

scores = []
rankings = []
for query in tokenized_queries:
    query_scores = bm25.get_scores(query)
    query_rankings = np.argsort(-query_scores)
    scores.append(query_scores)
    rankings.append(query_rankings)

# Flatten scores and rankings
flat_scores = np.concatenate(scores)
flat_rankings = np.concatenate(rankings)

# Create relevance labels
relevance_labels = np.zeros(len(flat_scores))
for i, doc_id in enumerate(val_data.pid):
    if doc_id in val_data.loc[val_data.gid == val_data.gid[i], 'pid'].values[:10]:
        relevance_labels[i] = 1

# Compute average precision score
ap_score = average_precision_score(relevance_labels, flat_scores)
print("Average Precision Score:", ap_score)

# Compute NDCG score
ndcg = ndcg_score(relevance_labels, flat_scores, k=10)
print("NDCG Score:", ndcg)

Average Precision Score: 0.00041687844922848354
NDCG Score: 0.0


### Task 2: Logistic Regression

**Defining and Training the Model**

In [15]:
lr_model = LogisticRegression(random_state=42,n_jobs=-1,max_iter=1000)
lr_model.fit(train_features, train_relevance)

predictions = lr_model.predict(val_features)
accuracy = accuracy_score(val_relevance, predictions)

**Evaluating the performance of the model on the validation set**

In [16]:
print(f"Accuracy on validation set: {accuracy}")
print("Avg. Precision on validation set:", average_precision_score(val_relevance, predictions))
print("NDCG Score on validation set:", ndcg_score(val_relevance, predictions))

Accuracy on validation set: 0.5728476821192053
Avg. Precision on validation set: 0.5703126395885824
NDCG Score on validation set: 0.9148335887987036


### Task 3: LambdaMART Model

**Defining and Training the Model**

In [17]:
param_grid = {
    'learning_rate': [0.01, 0.1, 0.5],
    'gamma': [0.0, 0.5, 1.0],
    'min_child_weight': [0.1, 1.0, 10.0],
    'max_depth': [3, 5, 7],
    'subsample': [0.5, 0.75, 1.0],
    'colsample_bytree': [0.5, 0.75, 1.0],
}

# Define LambdaMART model
params = {
    'objective': 'rank:ndcg',
    'learning_rate': 10.,
    'gamma': 0.0,
    'min_child_weight': 0.1,
    'max_depth': 7,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
}

# Train LambdaMART model
dtrain = xgb.DMatrix(train_features, label=train_relevance)
dval = xgb.DMatrix(val_features, label=val_relevance)
lm_model = xgb.train(params, dtrain)

# Evaluate LambdaMART model on validation data
val_preds = lm_model.predict(dval)
val_data['lm_rank'] = val_preds
val_data = val_data.sort_values(['query', 'lm_rank'], ascending=[True, False])
val_grouped = val_data.groupby('query')
y_true = []
y_score = []
for name, group in val_grouped:
    y_true.extend(group['relevance'].values)
    y_score.extend(group['relevance'].rank(method='first', ascending=False).values)
    
y_true = np.array(y_true)
y_score = np.array(y_score)

print("NDCG Score:", ndcg_score(y_true, y_score))
print('MAP@10:', average_precision_score(y_true, y_score))

NDCG Score: 0.7820018607672368
MAP@10: 0.3073915272609424


**Evaluating the performance of the model on the validation set**

In [18]:
print("NDCG Score:", ndcg_score(y_true, y_score))
print('MAP@10:', average_precision_score(y_true, y_score))

NDCG Score: 0.7820018607672368
MAP@10: 0.3073915272609424


### Task 4: Neural Network Model

In [19]:
# Define neural network model
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = nn.functional.softmax(out, dim=1)
        return out

# Define training loop
def train_model(model, optimizer, loss_fn, train_loader, num_epochs):
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, labels.long())  # Convert labels to Long type
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

# Define evaluation function
def evaluate(model, dataloader):
    model.eval()
    all_predictions = []
    all_labels = []
    all_outputs = []
    with torch.no_grad():
        for data in dataloader:
            inputs, labels = data
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            all_predictions.extend(predicted.numpy())
            all_labels.extend(labels.numpy())
            all_outputs.extend(outputs.numpy())
    all_predictions = np.array(all_predictions)
    all_labels = np.array(all_labels)
    all_outputs = np.array(all_outputs)
    acc = accuracy_score(all_labels, all_predictions)
    ndcg = ndcg_score(all_labels, all_predictions)
    mAP = average_precision_score(all_labels, all_outputs[:, 1].flatten())
    return acc, ndcg, mAP

# Set random seed for reproducibility
torch.manual_seed(42)

# Convert data to PyTorch tensors and create data loaders
train_tensor = torch.utils.data.TensorDataset(torch.tensor(train_features), torch.tensor(train_relevance))
train_loader = torch.utils.data.DataLoader(train_tensor, batch_size=32, shuffle=True)
val_tensor = torch.utils.data.TensorDataset(torch.tensor(val_features), torch.tensor(val_relevance))
val_loader = torch.utils.data.DataLoader(val_tensor, batch_size=32, shuffle=False)

# Initialize model and optimizer
nn_model = Net(input_size=train_features.shape[1], hidden_size=16, output_size=2)
optimizer = optim.SGD(nn_model.parameters(), lr=0.1, momentum=0.9)
loss_fn = nn.CrossEntropyLoss()

# Train model
train_model(nn_model, optimizer, loss_fn, train_loader, num_epochs=100)

# Evaluate model on validation data
val_acc, val_ndcg, val_mAP = evaluate(nn_model, val_loader)
print(f"\nValidation Accuracy: {val_acc:.4f}")
print(f"Validation NDCG Score: {val_ndcg:.4f}")
print(f"Validation Mean Average Precision: {val_mAP:.4f}")

Epoch 1, Loss: 0.6787
Epoch 2, Loss: 0.6694
Epoch 3, Loss: 0.6638
Epoch 4, Loss: 0.6618
Epoch 5, Loss: 0.6561
Epoch 6, Loss: 0.6547
Epoch 7, Loss: 0.6485
Epoch 8, Loss: 0.6470
Epoch 9, Loss: 0.6475
Epoch 10, Loss: 0.6438
Epoch 11, Loss: 0.6431
Epoch 12, Loss: 0.6375
Epoch 13, Loss: 0.6344
Epoch 14, Loss: 0.6343
Epoch 15, Loss: 0.6355
Epoch 16, Loss: 0.6291
Epoch 17, Loss: 0.6310
Epoch 18, Loss: 0.6273
Epoch 19, Loss: 0.6276
Epoch 20, Loss: 0.6244
Epoch 21, Loss: 0.6246
Epoch 22, Loss: 0.6241
Epoch 23, Loss: 0.6230
Epoch 24, Loss: 0.6189
Epoch 25, Loss: 0.6191
Epoch 26, Loss: 0.6161
Epoch 27, Loss: 0.6205
Epoch 28, Loss: 0.6180
Epoch 29, Loss: 0.6119
Epoch 30, Loss: 0.6152
Epoch 31, Loss: 0.6128
Epoch 32, Loss: 0.6156
Epoch 33, Loss: 0.6139
Epoch 34, Loss: 0.6115
Epoch 35, Loss: 0.6094
Epoch 36, Loss: 0.6100
Epoch 37, Loss: 0.6109
Epoch 38, Loss: 0.6080
Epoch 39, Loss: 0.6134
Epoch 40, Loss: 0.6103
Epoch 41, Loss: 0.6114
Epoch 42, Loss: 0.6213
Epoch 43, Loss: 0.6156
Epoch 44, Loss: 0.61

### Submission of Test Results

In [20]:
def generate_submission_file(model, model_name, test_queries, test_passages, output_path, embeddings_dict):
    """
    Generates a submission file containing the predicted ranking scores for each query-passage pair in the test set.

    Args:
        model (object): The trained machine learning model to use for ranking.
        model_name (str): The name of the machine learning model used.
        test_queries (str): The test queries file.
        test_passages (str): The test passages file.
        output_path (str): The file path to save the generated submission file.

    Returns:
        None
    """

    # Combine queries and passages
    test_data = test_queries.merge(test_passages, on='gid').drop('query_y', axis=1)
    
    # Generate features
    test = test_data['query_x'] + ' ' + test_data['passage']
    test_features = generate_embeddings(test, embeddings_dict)

    # Normalize the input features
    scaler = StandardScaler()
    test_features = scaler.fit_transform(test_features)
    
    # Generate predictions
    if model_name == 'LM':
        test_features = xgb.DMatrix(test_features)
        predictions = model.predict(test_features)
        scores = 1 / (1 + np.exp(-predictions))
#         predictions = (scores >= 0.5).astype(int)
    elif model_name == 'NN':
        test_features = torch.from_numpy(test_features).float()
        with torch.no_grad():
            predictions = model(test_features)
            scores = predictions[:, 1].numpy()
            predictions = np.argmax(predictions, axis=1).numpy()
    else:
        predictions = model.predict(test_features)
        scores = model.predict_proba(test_features)[:, 1]

    # Combine predictions with query and passage information
    results = test_data[['gid', 'pid']].copy()
    results['rank'] = predictions
    results['score'] = scores
    results['algoname'] = model_name

    # Rank passages within each query
    results['rank'] = results.groupby('gid')['score'].rank(method='dense', ascending=False).astype(int)

    # Write results to file
    with open(output_path, 'w') as f:
        for qid in results['gid'].unique():
            query_results = results[results['gid'] == qid].sort_values('rank')
            for _, row in query_results.iterrows():
                line = f"{row['gid']} A2 {row['pid']} {row['rank']} {row['score']} {model_name}\n"
                f.write(line)

In [21]:
# Generate LR submission file
generate_submission_file(lr_model, 'LR', test_queries, candidate_passages, 'LR.txt', embeddings_dict)

# Generate LambdaMart submission file
generate_submission_file(lm_model, 'LM', test_queries, candidate_passages, 'LM.txt', embeddings_dict)

# Generate NN submission file
generate_submission_file(nn_model, 'NN', test_queries, candidate_passages, 'NN.txt', embeddings_dict)