In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


DATA PREPROCESSING

In [2]:
import pandas as pd

def preprocess_data_from_file(file_path):
    # Initialize lists to store data
    features_list = []
    relevance_list = []

    # Read data from the file
    with open(file_path, 'r') as file:
        data = file.readlines()

    # Process each line of data
    for line in data:
        # Split by space
        parts = line.split(" ")
        # Extract qid
        qid = parts[0]  # Assuming qid is the first part of each line
        # Extract relevance (removing trailing newline character)
        relevance = parts[-1].strip()
        # Extract features
        features = parts[1:-4]  # Assuming features are between qid and relevance
        # Append to respective lists
        features_list.append(features)
        relevance_list.append(relevance)

    # Convert features to DataFrame
    features_df = pd.DataFrame(features_list)

    # Add qid and relevance columns
    features_df.insert(0, "qid", qid)
    features_df["relevance"] = relevance_list

    # Rename columns
    columns = {i: f"feature{i+1}" for i in range(len(features_df.columns)-1)}
    features_df.rename(columns=columns, inplace=True)

    return features_df



# Example usage:
file_path1 = '/content/drive/My Drive/IE_506_RANKING/S1.txt'  # Update with your file path
features_df1 = preprocess_data_from_file(file_path1)

In [3]:
def preprocess_data(data):
    rows = []
    for row in data:
        if '#' in row:
            row = row.split('#')[0].strip()  # Exclude the part after '#'
        if not row:
            continue  # Skip empty lines
        parts = row.strip().split(' ')  # Split row by spaces
        relevance = int(parts[0])  # First number is relevance
        qid = int(parts[1].split(':')[1])  # Extract qid
        features = {}
        for part in parts[2:]:
            idx, val = part.split(':')
            features[int(idx)] = float(val)
        row_dict = {'qid': qid, 'relevance': relevance}
        row_dict.update(features)  # Add features to the row
        rows.append(row_dict)
    return pd.DataFrame(rows)

# Read data from a text file
file_path1 = '/content/drive/My Drive/IE_506_RANKING/S1.txt'  # Update with your file path
with open(file_path1, 'r') as file:
    data1 = file.readlines()

# Example usage:
df1 = preprocess_data(data1)

In [4]:
# Print the last column
last_column = features_df1.iloc[:, -1]
# Add the last column to the DataFrame at the last position
df1['prob'] = last_column
# Convert the last column from string to float
df1['prob'] = df1['prob'].astype(float)

In [5]:
df1.head(125)

Unnamed: 0,qid,relevance,1,2,3,4,5,6,7,8,...,38,39,40,41,42,43,44,45,46,prob
0,10,0,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.017241,0.000000,0.000000,0.0,0.024691
1,10,1,0.031310,0.666667,0.50,0.166667,0.033206,0.0,0.0,0.0,...,0.686107,0.823908,0.750092,0.385426,0.923077,0.086207,0.333333,0.448276,0.0,0.416367
2,10,1,0.078682,0.166667,0.50,0.333333,0.080022,0.0,0.0,0.0,...,0.578581,0.868557,0.641385,0.010462,0.076923,0.074713,0.833333,0.678161,0.0,0.568950
3,10,1,0.019058,1.000000,1.00,0.500000,0.022591,0.0,0.0,0.0,...,0.868457,1.000000,0.863460,0.016642,0.153846,0.040230,0.833333,0.896552,0.0,0.775913
4,10,0,0.039477,0.000000,0.75,0.166667,0.040555,0.0,0.0,0.0,...,0.569440,0.769845,0.646567,0.073711,0.076923,0.034483,0.333333,0.218391,0.0,0.334800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,34,0,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.078947,0.000000,0.000000,0.0,0.060317
121,34,0,0.074129,0.000000,0.00,0.000000,0.074129,0.0,0.0,0.0,...,0.695368,0.718155,0.647774,0.011770,0.150000,0.842105,0.571429,0.915254,0.0,0.136433
122,34,0,0.034099,0.000000,0.00,0.000000,0.034099,0.0,0.0,0.0,...,0.708105,0.710384,0.711672,1.000000,1.000000,0.342105,0.714286,0.576271,0.0,0.175514
123,34,0,0.074129,0.000000,0.00,0.000000,0.074129,0.0,0.0,0.0,...,0.707261,0.727017,0.660234,0.001154,0.050000,0.026316,0.571429,1.000000,0.0,0.158908


In [6]:
import pandas as pd
from itertools import permutations

def pairwise_ranking(df):
    output_data = []

    # Group the DataFrame by 'qid'
    grouped = df.groupby('qid')

    # Iterate over each group
    for qid, group in grouped:
        # Generate all permutations of pairs within the group
        pairs = permutations(group.iterrows(), 2)

        # Iterate over each pair
        for (idx1, doc1), (idx2, doc2) in pairs:
            # Extract relevance and prob values for each document
            relevance1, relevance2 = doc1['relevance'], doc2['relevance']
            prob1, prob2 = doc1['prob'], doc2['prob']

            # Extract features for doc1 and doc2
            features1 = doc1.drop(['qid', 'relevance', 'prob'])
            features2 = doc2.drop(['qid', 'relevance', 'prob'])

            # Determine output label based on relevance and prob values
            if relevance1 > relevance2:
                output_label = 1
            elif relevance1 < relevance2:
                output_label = 0
            else:
                output_label = 1 if prob1 > prob2 else 0

            # Append the result to the output data
            output_data.append({'qid': qid, 'doc1': idx1, 'doc2': idx2, 'output_label': output_label,
                                **{f'feature_{i}': features1[i] for i in range(1, 47)},
                                **{f'feature_{i+46}': features2[i] for i in range(1, 47)}})

    # Create DataFrame from output data
    output_df = pd.DataFrame(output_data)
    return output_df


In [7]:
# Example usage:
# Assuming 'df' is the DataFrame containing qid, features, and relevance level
output_df1 = pairwise_ranking(df1)

In [8]:
output_df1

Unnamed: 0,qid,doc1,doc2,output_label,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_83,feature_84,feature_85,feature_86,feature_87,feature_88,feature_89,feature_90,feature_91,feature_92
0,10,0,1,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.646890,0.686107,0.823908,0.750092,0.385426,0.923077,0.086207,0.333333,0.448276,0.0
1,10,0,2,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.649824,0.578581,0.868557,0.641385,0.010462,0.076923,0.074713,0.833333,0.678161,0.0
2,10,0,3,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.918403,0.868457,1.000000,0.863460,0.016642,0.153846,0.040230,0.833333,0.896552,0.0
3,10,0,4,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.565875,0.569440,0.769845,0.646567,0.073711,0.076923,0.034483,0.333333,0.218391,0.0
4,10,0,5,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.912987,0.851143,0.676743,0.871849,0.001705,0.076923,0.000000,0.500000,0.425287,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581381,1828,14012,14007,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.705032,0.983064,0.874043,0.983256,0.000118,0.000713,0.152174,0.500000,0.129252,0.0
581382,1828,14012,14008,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.253665,0.519413,0.497421,0.505797,0.000000,0.000000,0.782609,0.500000,0.081633,0.0
581383,1828,14012,14009,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.498324,0.665563,0.793696,0.682627,0.013283,0.001427,0.021739,0.500000,0.190476,0.0
581384,1828,14012,14010,0,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.603386,0.906406,0.929225,0.873596,0.000061,0.000713,0.000000,0.500000,0.115646,0.0


Adding rel1,2 and prob1,2 values in output_df1

In [9]:
import pandas as pd

# Assuming old_df has columns 'doc', 'relevance_label', and 'probability_score'
# Assuming df has columns 'doc1' and 'doc2'

# Create a copy of df
df_copy = output_df1.copy()

# Merge old_df with df_copy for relevance label and probability score of doc1
def get_values(doc_id):
    return df1.loc[doc_id, ['relevance', 'prob']]

# Iterate through each row of df and get relevance and probability for doc1
df_copy['rel1'], df_copy['prob1'] = zip(*df_copy['doc1'].map(get_values))


In [10]:
df_copy['rel2'], df_copy['prob2'] = zip(*df_copy['doc2'].map(get_values))

In [11]:
df_copy

Unnamed: 0,qid,doc1,doc2,output_label,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_87,feature_88,feature_89,feature_90,feature_91,feature_92,rel1,prob1,rel2,prob2
0,10,0,1,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.385426,0.923077,0.086207,0.333333,0.448276,0.0,0.0,0.024691,1.0,0.416367
1,10,0,2,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.010462,0.076923,0.074713,0.833333,0.678161,0.0,0.0,0.024691,1.0,0.568950
2,10,0,3,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.016642,0.153846,0.040230,0.833333,0.896552,0.0,0.0,0.024691,1.0,0.775913
3,10,0,4,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.073711,0.076923,0.034483,0.333333,0.218391,0.0,0.0,0.024691,0.0,0.334800
4,10,0,5,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.001705,0.076923,0.000000,0.500000,0.425287,0.0,0.0,0.024691,1.0,0.698286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581381,1828,14012,14007,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.000118,0.000713,0.152174,0.500000,0.129252,0.0,1.0,0.269725,1.0,0.103183
581382,1828,14012,14008,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.000000,0.000000,0.782609,0.500000,0.081633,0.0,1.0,0.269725,0.0,0.058078
581383,1828,14012,14009,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.013283,0.001427,0.021739,0.500000,0.190476,0.0,1.0,0.269725,0.0,0.165237
581384,1828,14012,14010,0,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.000061,0.000713,0.000000,0.500000,0.115646,0.0,1.0,0.269725,1.0,0.427653


K = no. of queries

In [12]:
K = df_copy['qid'].nunique()
print(K)

339


Computing DCG_beta scores corresponding to every  query

In [13]:
import numpy as np

def compute_DCG_beta(df):
    DCG_beta_scores = []
    for qid in df['qid'].unique():
        # Filter dataframe for the current qid
        qid_df = df[df['qid'] == qid]

        # Sort documents based on relevance label and probability score
        sorted_df = qid_df.sort_values(by=['relevance', 'prob'], ascending=[False, False])

        # Calculate DCG_beta for the selected documents
        DCG_beta = np.sum(sorted_df['relevance'] * (len(qid_df) - np.arange(1, len(qid_df) + 1)))

        DCG_beta_scores.append(DCG_beta)

    return DCG_beta_scores

# Call the function to compute DCG_beta scores
DCG_beta_scores = compute_DCG_beta(df1)


In [14]:
# Check the type of DCG_beta_scores
print(type(DCG_beta_scores))


<class 'list'>


In [15]:
def replace_zero_with_one_in_scores(scores):
    # Create a copy of the list
    modified_scores = scores.copy()
    # Replace zeros with ones
    modified_scores = [1 if score == 0 else score for score in modified_scores]
    return modified_scores

# Replace zeros with ones
modified_DCG_beta_scores = replace_zero_with_one_in_scores(DCG_beta_scores)


In [16]:
len(DCG_beta_scores)

339

In [17]:
unique_qids = df1['qid'].unique()

In [18]:
qid_to_dcg_beta = dict(zip(unique_qids, modified_DCG_beta_scores))

# Map the DCG beta scores to the corresponding qid values in df_copy
df_copy['DCG_beta'] = df_copy['qid'].map(qid_to_dcg_beta)

In [19]:
df_copy

Unnamed: 0,qid,doc1,doc2,output_label,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_88,feature_89,feature_90,feature_91,feature_92,rel1,prob1,rel2,prob2,DCG_beta
0,10,0,1,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.923077,0.086207,0.333333,0.448276,0.0,0.0,0.024691,1.0,0.416367,504
1,10,0,2,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.076923,0.074713,0.833333,0.678161,0.0,0.0,0.024691,1.0,0.568950,504
2,10,0,3,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.153846,0.040230,0.833333,0.896552,0.0,0.0,0.024691,1.0,0.775913,504
3,10,0,4,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.076923,0.034483,0.333333,0.218391,0.0,0.0,0.024691,0.0,0.334800,504
4,10,0,5,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.076923,0.000000,0.500000,0.425287,0.0,0.0,0.024691,1.0,0.698286,504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581381,1828,14012,14007,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.000713,0.152174,0.500000,0.129252,0.0,1.0,0.269725,1.0,0.103183,252
581382,1828,14012,14008,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.000000,0.782609,0.500000,0.081633,0.0,1.0,0.269725,0.0,0.058078,252
581383,1828,14012,14009,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.001427,0.021739,0.500000,0.190476,0.0,1.0,0.269725,0.0,0.165237,252
581384,1828,14012,14010,0,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.000713,0.000000,0.500000,0.115646,0.0,1.0,0.269725,1.0,0.427653,252


Writing custom loss function for neural netwrk with L2 regularization.

In [20]:
import tensorflow as tf

# Define the custom loss function using vectorized operations
def custom_loss(y_true, y_pred, df, K=339.0):
    # Extract relevant columns from the dataframe
    rel1 = df['rel1']
    rel2 = df['rel2']
    prob1 = df['prob1']
    prob2 = df['prob2']
    DCG_beta = df['DCG_beta']

    # Calculate the difference in relevance scores
    score_diff = tf.abs(rel1 + prob1 - rel2 - prob2)

    # Calculate tau_i using vectorized operations
    tau_i = score_diff / (K * DCG_beta)

    # Cast tau_i, y_pred, and y_true to float32
    tau_i = tf.cast(tau_i, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    y_true = tf.cast(y_true, tf.float32)

    # Compute the loss for each row using vectorized operations
    loss = tau_i * (y_true * tf.square(1 - y_pred) + (1 - y_true) * tf.square(y_pred))

    # Sum up the losses for all rows
    total_loss = tf.reduce_sum(loss)

    return total_loss

# List of columns to be dropped from dataframe (assuming they're not needed for training)
cols_to_drop = ['output_label', 'qid', 'doc1', 'doc2', 'rel1', 'rel2', 'prob1', 'prob2']

# Load your dataframe here
# Replace this with your actual code to load the dataframe
# df = pd.read_csv('your_data.csv')  # Assuming your data is in a CSV file

# Drop the specified columns to get relevant data for training
X_train = df_copy.drop(cols_to_drop, axis=1).values
y_train = df_copy['output_label'].values

# Define the neural network model with L2 regularization
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01), input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Output probability P(i,j)
])

# Compile the model with custom loss function and accuracy metric
model.compile(optimizer='adam', loss=lambda y_true, y_pred: custom_loss(y_true, y_pred, df_copy), metrics=['accuracy'])

# Train the model using all the training data
model.fit(X_train, y_train, epochs=10, batch_size=32)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d0ef6e77f10>

In [21]:
# Assuming X_test contains the features of new data for which you want to predict probabilities
probabilities = model.predict(X_train)

# Display probabilities
print(probabilities)


[[0.05301875]
 [0.05771547]
 [0.03057352]
 ...
 [0.6629256 ]
 [0.4247882 ]
 [0.67728055]]


In [22]:
print(probabilities.shape)

(581386, 1)


In [23]:
# Assuming probabilities is the numpy array containing the probabilities
df_copy['probabilities'] = probabilities

In [24]:
df_copy

Unnamed: 0,qid,doc1,doc2,output_label,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_89,feature_90,feature_91,feature_92,rel1,prob1,rel2,prob2,DCG_beta,probabilities
0,10,0,1,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.086207,0.333333,0.448276,0.0,0.0,0.024691,1.0,0.416367,504,0.053019
1,10,0,2,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.074713,0.833333,0.678161,0.0,0.0,0.024691,1.0,0.568950,504,0.057715
2,10,0,3,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.040230,0.833333,0.896552,0.0,0.0,0.024691,1.0,0.775913,504,0.030574
3,10,0,4,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.034483,0.333333,0.218391,0.0,0.0,0.024691,0.0,0.334800,504,0.073051
4,10,0,5,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.500000,0.425287,0.0,0.0,0.024691,1.0,0.698286,504,0.066921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581381,1828,14012,14007,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.152174,0.500000,0.129252,0.0,1.0,0.269725,1.0,0.103183,252,0.486872
581382,1828,14012,14008,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.782609,0.500000,0.081633,0.0,1.0,0.269725,0.0,0.058078,252,0.851478
581383,1828,14012,14009,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.021739,0.500000,0.190476,0.0,1.0,0.269725,0.0,0.165237,252,0.662926
581384,1828,14012,14010,0,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.000000,0.500000,0.115646,0.0,1.0,0.269725,1.0,0.427653,252,0.424788


In [25]:
import pandas as pd

# Assuming merged_df is your DataFrame containing the 'doc1' and 'probabilities' columns

# Calculate the sum of probabilities for each unique value in the 'doc1' column
sum_probabilities = df_copy.groupby('doc1')['probabilities'].sum()

# Map the sum of probabilities back to the original DataFrame
df_copy['sum_probabilities'] = df_copy['doc1'].map(sum_probabilities)


In [26]:
df_copy

Unnamed: 0,qid,doc1,doc2,output_label,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_90,feature_91,feature_92,rel1,prob1,rel2,prob2,DCG_beta,probabilities,sum_probabilities
0,10,0,1,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.333333,0.448276,0.0,0.0,0.024691,1.0,0.416367,504,0.053019,4.000304
1,10,0,2,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.833333,0.678161,0.0,0.0,0.024691,1.0,0.568950,504,0.057715,4.000304
2,10,0,3,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.833333,0.896552,0.0,0.0,0.024691,1.0,0.775913,504,0.030574,4.000304
3,10,0,4,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.333333,0.218391,0.0,0.0,0.024691,0.0,0.334800,504,0.073051,4.000304
4,10,0,5,0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.500000,0.425287,0.0,0.0,0.024691,1.0,0.698286,504,0.066921,4.000304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581381,1828,14012,14007,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.500000,0.129252,0.0,1.0,0.269725,1.0,0.103183,252,0.486872,23.553329
581382,1828,14012,14008,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.500000,0.081633,0.0,1.0,0.269725,0.0,0.058078,252,0.851478,23.553329
581383,1828,14012,14009,1,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.500000,0.190476,0.0,1.0,0.269725,0.0,0.165237,252,0.662926,23.553329
581384,1828,14012,14010,0,0.009058,0.0,0.0,0.0,0.009056,0.0,...,0.500000,0.115646,0.0,1.0,0.269725,1.0,0.427653,252,0.424788,23.553329


In [27]:
import pandas as pd

def create_new_dataframe(df):
    # Select unique combinations of qid, doc1, and sum_probabilities
    unique_df = df[['qid', 'doc1', 'sum_probabilities']].drop_duplicates('doc1').reset_index(drop=True)

    return unique_df

# Example usage:
new_df = create_new_dataframe(df_copy)



In [28]:
new_df

Unnamed: 0,qid,doc1,sum_probabilities
0,10,0,4.000304
1,10,1,18.113077
2,10,2,19.261211
3,10,3,24.954279
4,10,4,15.880794
...,...,...,...
14008,1828,14008,10.826729
14009,1828,14009,17.640949
14010,1828,14010,22.682821
14011,1828,14011,17.326057


In [29]:
import pandas as pd

def merge_relevance_and_prob_columns(new_df, df1):
    # Concatenate the relevance and prob columns from df1 to new_df
    merged_df = pd.concat([new_df, df1[['relevance', 'prob']]], axis=1)

    return merged_df

# Example usage:
merged_new_df = merge_relevance_and_prob_columns(new_df, df1)

In [30]:
merged_new_df

Unnamed: 0,qid,doc1,sum_probabilities,relevance,prob
0,10,0,4.000304,0,0.024691
1,10,1,18.113077,1,0.416367
2,10,2,19.261211,1,0.568950
3,10,3,24.954279,1,0.775913
4,10,4,15.880794,0,0.334800
...,...,...,...,...,...
14008,1828,14008,10.826729,0,0.058078
14009,1828,14009,17.640949,0,0.165237
14010,1828,14010,22.682821,1,0.427653
14011,1828,14011,17.326057,1,0.158485


Calculating DCG score for ideal case

In [49]:
import numpy as np

def compute_dcg(df, k=15):
    dcg_scores = []
    for qid in df['qid'].unique():
        # Filter dataframe for the current qid
        qid_df = df[df['qid'] == qid]

        # Sort documents based on relevance label and probability score
        sorted_df = qid_df.sort_values(by=['relevance', 'prob'], ascending=[False, False])

        # Select top k documents
        top_k_df = sorted_df.head(k)

        # Calculate DCG for the selected documents
        dcg = np.sum((2 ** top_k_df['relevance'] - 1) / np.log2(np.arange(2, k + 2)))

        dcg_scores.append(dcg)

    return dcg_scores

# Call the function to compute DCG score for the ideal case
ideal_dcg_scores = compute_dcg(merged_new_df)


In [50]:
ideal_dcg_scores

[5.8613479980138035,
 9.08436179088239,
 3.0,
 1.6309297535714575,
 9.850382753865262,
 5.8613479980138035,
 10.215953061456498,
 1.0,
 8.35459994530971,
 3.0,
 5.8613479980138035,
 5.8613479980138035,
 2.1309297535714578,
 7.215324023249391,
 10.215953061456498,
 0.0,
 4.254494511770457,
 4.948459118879392,
 0.0,
 9.873207505156719,
 4.892789260714372,
 7.210318626022307,
 7.092740438166795,
 5.8613479980138035,
 4.543559338088345,
 4.948459118879392,
 0.0,
 8.516354018913372,
 4.822502283739475,
 6.823465818787765,
 0.0,
 11.75826623577259,
 5.6113479980138035,
 5.8613479980138035,
 7.516354018913372,
 4.561606311644851,
 4.822502283739475,
 2.9484591188793923,
 12.450488370301299,
 2.5616063116448506,
 7.566525813130329,
 2.1309297535714578,
 3.6379996393207477,
 5.8613479980138035,
 5.8613479980138035,
 16.046828874347394,
 5.8613479980138035,
 3.9534645161064765,
 3.304666305987414,
 17.584043994041412,
 7.899859146463662,
 15.506352565492755,
 1.6309297535714575,
 10.478602596493

Calculating mdel DCG scores

In [51]:
import numpy as np

def compute_model_dcg(df, k=15):
    dcg_scores = []
    for qid in df['qid'].unique():
        # Filter dataframe for the current qid
        qid_df = df[df['qid'] == qid]

        # Sort documents based on sum_probabilities
        sorted_df = qid_df.sort_values(by='sum_probabilities', ascending=False)

        # Select top k documents
        top_k_df = sorted_df.head(k)

        # Calculate DCG for the selected documents using relevance column
        dcg = np.sum((2 ** top_k_df['relevance'] - 1) / np.log2(np.arange(2, k + 2)))

        dcg_scores.append(dcg)

    return dcg_scores

# Call the function to compute DCG score for the model ranking
model_dcg_scores = compute_model_dcg(merged_new_df)

In [52]:
model_dcg_scores

[2.564149206828139,
 3.467947307772609,
 3.0,
 1.2890648263178879,
 8.202481715383039,
 3.6975305549778335,
 5.6822103946774,
 0.3010299956639812,
 0.6343633289973145,
 0.0,
 4.695552245128133,
 4.632687913646256,
 0.8299729413151111,
 2.2920296742201796,
 4.685608962828831,
 0.0,
 1.52665942644392,
 1.901346723184193,
 0.0,
 6.986381837866431,
 0.8107144632819592,
 6.3543776780819785,
 1.5,
 5.52801466468047,
 3.9815549767942113,
 0.8444078224368586,
 0.0,
 0.3562071871080222,
 2.4296858639348864,
 2.2879486051115805,
 0.0,
 1.7641483191359135,
 5.32846030854929,
 5.591109843586484,
 1.0,
 3.187882802898523,
 1.8973093428578993,
 1.92124707417873,
 5.245172672100417,
 0.5,
 5.208254137500101,
 0.0,
 1.4319597492354388,
 5.4306714399404115,
 3.425584328911591,
 12.621619847393601,
 5.6113479980138035,
 1.4182092338047914,
 1.1202170785147485,
 15.741716818248262,
 1.9380362447161694,
 12.778306352100312,
 0.6309297535714575,
 6.553798305154431,
 5.230418244442347,
 3.427527631486865,
 

In [53]:
import pandas as pd

# Extract unique qids from df1
unique_qids = df1['qid'].unique()

# Create a dataframe to store qids, model_dcg_scores, and ideal_dcg_scores
result_df = pd.DataFrame({'qid': unique_qids})

# Add model_dcg_scores to the result dataframe
result_df['model_dcg_scores'] = model_dcg_scores

# Add ideal_dcg_scores to the result dataframe (assuming you already have them calculated)
result_df['ideal_dcg_scores'] = ideal_dcg_scores


In [54]:
result_df

Unnamed: 0,qid,model_dcg_scores,ideal_dcg_scores
0,10,2.564149,5.861348
1,15,3.467947,9.084362
2,33,3.000000,3.000000
3,34,1.289065,1.630930
4,37,8.202482,9.850383
...,...,...,...
334,1787,0.356207,1.000000
335,1798,1.000000,1.000000
336,1823,2.561606,3.953465
337,1825,0.000000,0.000000


In [55]:
# Remove rows where either model_dcg_scores or ideal_dcg_scores are zero
result_df = result_df[(result_df['model_dcg_scores'] != 0) & (result_df['ideal_dcg_scores'] != 0)]

In [56]:
result_df

Unnamed: 0,qid,model_dcg_scores,ideal_dcg_scores
0,10,2.564149,5.861348
1,15,3.467947,9.084362
2,33,3.000000,3.000000
3,34,1.289065,1.630930
4,37,8.202482,9.850383
...,...,...,...
333,1783,0.778943,2.130930
334,1787,0.356207,1.000000
335,1798,1.000000,1.000000
336,1823,2.561606,3.953465


Average NDCG score across all queries

In [57]:
# Calculate the ratio of model_dcg_scores to ideal_dcg_scores
result_df['ndcg_ratio'] = result_df['model_dcg_scores'] / result_df['ideal_dcg_scores']

# Calculate the average NDCG ratio
avg_ndcg = result_df['ndcg_ratio'].mean()

# Print the average NDCG ratio
print("Average NDCG Ratio:", avg_ndcg)


Average NDCG Ratio: 0.5619749915663868


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['ndcg_ratio'] = result_df['model_dcg_scores'] / result_df['ideal_dcg_scores']
