Used to search for files that match a specified pattern.

In [2]:
!pip install pandas
import pandas as pd
import numpy as np
import glob



In [3]:
# import qrels csv file
qrels_file_path = 'qrels.trec8.adhoc.csv'
qrels_df = pd.read_csv(qrels_file_path, sep=' ', header=None)

# Get a list of all CSV files in the input directory
input_files = glob.glob('input/*.csv')

# Create an empty list to store the input files dataframes
input_dfs = []

# Loop through each input CSV file
for filename in input_files:
    # Read the input CSV file into a dataframe
    df = pd.read_csv(filename, sep='\t', header=None)
    # Append the dataframe to the list of dataframes
    input_dfs.append(df)
    
raw_data = pd.concat(input_dfs, ignore_index=True)
print('Shape of raw_data:', raw_data.shape)

############################################## Data cleaning ##############################################

# Loop through each input dataframe and filter out rows where does not meet the clean up requirement
# 1. data_id  is not present in qrels_data_ids
# 2. relevant_score equal to 0
# 3. duplicate ranking

# Extract the valid values from the third column of qrels_df
qrels_data_ids = set(qrels_df.iloc[:, 2])

cleaned_dfs = []

for df in input_dfs:
    # Get rows where the data id is present in the qrels data ids
    df_with_present_data_id = df.iloc[:, 2].isin(qrels_data_ids)
    
    # Get rows where relevant_score does not equal 0
    df_with_relevant_score_not_zero = df.iloc[:, 4] != 0
    
    # Combine targeted rows for first and second rule
    cleaned_df = df[df_with_present_data_id & df_with_relevant_score_not_zero]

    # Get rows where ranking is not duplicated
    cleaned_df = cleaned_df.drop_duplicates(subset=[4])
    cleaned_dfs.append(cleaned_df)

# Concatenate all cleaned dataframes into a single dataframe
cleaned_data = pd.concat(cleaned_dfs, ignore_index=True)

# Print the shape of the cleaned data
print('Shape of cleaned_data:', cleaned_data.shape)

Shape of raw_data: (689266, 6)
Shape of cleaned_data: (290603, 6)


In [4]:
cleaned_data.head()

Unnamed: 0,0,1,2,3,4,5
0,401,Q0,FT924-5091,0,10015.175781,acsys8aln2
1,401,Q0,FBIS3-39240,1,10014.111328,acsys8aln2
2,401,Q0,FT924-4470,2,10013.638672,acsys8aln2
3,401,Q0,FBIS4-18182,3,10013.591797,acsys8aln2
4,401,Q0,FBIS3-59055,4,10013.441406,acsys8aln2


QRELS check


In [5]:
import pandas as pd
import numpy as np

# Load the qrels file
qrels_file_path = 'qrels.trec8.adhoc.csv'
qrels_df = pd.read_csv(qrels_file_path, sep=' ', header=None, names=['query_id', 'ignore', 'doc_id', 'relevance'])

#show qrels_dataframe
qrels_df.head()

Unnamed: 0,query_id,ignore,doc_id,relevance
0,401,0,FBIS3-10009,0
1,401,0,FBIS3-10059,0
2,401,0,FBIS3-10142,0
3,401,0,FBIS3-1026,0
4,401,0,FBIS3-10502,0


In [6]:
# Get a list of all input files in the input directory
input_files = glob.glob('input/*.csv')

#check the input file list
print(input_files)

['input\\input.acsys8aln2.csv', 'input\\input.apl8c621.csv', 'input\\input.att99ate.csv', 'input\\input.disco1.csv', 'input\\input.Flab8atd2..csv', 'input\\input.GE8ATD3..csv', 'input\\input.ibms99a.csv', 'input\\input.INQ601.csv', 'input\\input.isa50t.csv', 'input\\input.mds08a3.csv', 'input\\input.MITSLStd.csv', 'input\\input.ok8alx.csv', 'input\\input.pir9Atd0.csv', 'input\\input.READWARE.csv', 'input\\input.Sab8A1.csv']


In [7]:
# Create an empty dictionary to store the system scores
system_scores = {}

# Loop through each input file
for filename in input_files:
    # Load the input file
    input_df = pd.read_csv(filename, sep='\t', header=None, names=['query_id', 'ignore', 'doc_id', 'rank', 'score', 'ignore2']) 
    input_df.head()

    # Merge the input file with the qrels to get the relevance scores
    merged_df = pd.merge(input_df, qrels_df, on=['query_id', 'doc_id'], how='left')
    merged_df.head()

    # Compute precision@10 and MAP per query
    grouped = merged_df.groupby('query_id')
    precisions = grouped.apply(lambda x: np.sum(x['relevance'].iloc[:10]) / 10)
    avg_precisions = grouped.apply(lambda x: np.sum(x['relevance'] * np.cumsum(x['relevance']) / np.arange(1, len(x['relevance']) + 1)) / np.sum(x['relevance']))

    # Store the scores for this system
    system_name = filename.split('/')[-1].split('.')[0]
    system_scores[system_name] = pd.DataFrame({'precision@10': precisions, 'MAP': avg_precisions})

  avg_precisions = grouped.apply(lambda x: np.sum(x['relevance'] * np.cumsum(x['relevance']) / np.arange(1, len(x['relevance']) + 1)) / np.sum(x['relevance']))


In [54]:
# Merge the scores for all systems into a single dataframe
all_system_scores = pd.concat(system_scores.values(), keys=system_scores.keys(), axis=1)

# Print the scores for all systems
print(all_system_scores)

          input\input          
         precision@10       MAP
query_id                       
401               0.2  0.133187
402               0.5  0.240247
403               0.6  0.737960
404               0.2  0.206948
405               0.3  0.164085
406               0.5  0.438768
407               0.8  0.502876
408               0.3  0.242085
409               0.1  0.050198
410               1.0  0.874088
411               0.4  0.290586
412               0.9  0.577200
413               0.0  0.058481
414               0.4  0.221919
415               1.0  0.542335
416               0.5  0.334489
417               0.5  0.198626
418               0.7  0.419000
419               0.1  0.038250
420               0.8  0.488695
421               0.2  0.137775
422               0.8  0.364895
423               0.7  0.465734
424               0.2  0.200787
425               0.9  0.534200
426               0.4  0.147631
427               0.4  0.265351
428               0.4  0.352502
429     

import pandas as pd
import numpy as np

# Load the qrels file
qrels_file_path = 'qrels.trec8.adhoc.csv'
qrels_df = pd.read_csv(qrels_file_path, sep=' ', header=None, names=['query_id', 'ignore', 'doc_id', 'relevance'])

# Get a list of all input files in the input directory
input_files = glob.glob('input/*.csv')

# Create an empty dictionary to store the system scores
system_scores = {}

# Loop through each input file
for filename in input_files:
    # Load the input file
    input_df = pd.read_csv(filename, sep='\t', header=None, names=['query_id', 'ignore', 'doc_id', 'rank', 'score', 'ignore2'])

    # Merge the input file with the qrels to get the relevance scores
    merged_df = pd.merge(input_df, qrels_df, on=['query_id', 'doc_id'], how='left')

    # Compute precision@10 and MAP per query
    grouped = merged_df.groupby('query_id')
    precisions = grouped.apply(lambda x: np.sum(x['relevance'].iloc[:10]) / 10)
    avg_precisions = grouped.apply(lambda x: np.sum(x['relevance'] * np.cumsum(x['relevance']) / np.arange(1, len(x['relevance']) + 1)) / np.sum(x['relevance']))

    # Store the scores for this system
    system_name = filename.split('/')[-1].split('.')[0]
    system_scores[system_name] = pd.DataFrame({'precision@10': precisions, 'MAP': avg_precisions})

# Merge the scores for all systems into a single dataframe
all_system_scores = pd.concat(system_scores.values(), keys=system_scores.keys(), axis=1)

# Print the scores for all systems
print(all_system_scores)



In [10]:
import pandas as pd
import numpy as np
import glob

# Load the qrels file
qrels_file_path = 'qrels.trec8.adhoc.csv'
qrels_df = pd.read_csv(qrels_file_path, sep=' ', header=None, names=['query_id', 'ignore', 'doc_id', 'relevance'])

# Get a list of all input files in the input directory
input_files = glob.glob('input/*.csv')

# Create an empty dictionary to store the system scores
system_scores = {}

# Loop through each input file
for filename in input_files:
    # Load the input file
    input_df = pd.read_csv(filename, sep='\t', header=None, names=['query_id', 'ignore', 'doc_id', 'rank', 'score', 'ignore2'])

    # Merge the input file with the qrels to get the relevance scores
    merged_df = pd.merge(input_df, qrels_df, on=['query_id', 'doc_id'], how='left')

    # Compute precision@10 and MAP per query
    grouped = merged_df.groupby('query_id')
    precisions = grouped.apply(lambda x: np.sum(x['relevance'].iloc[:10]) / 10)
    avg_precisions = grouped.apply(lambda x: np.sum(x['relevance'] * np.cumsum(x['relevance']) / np.arange(1, len(x['relevance']) + 1)) / np.sum(x['relevance']))

    # Compute the overall precision@10 and MAP scores for this system
    system_name = filename.split('/')[-1].split('.')[0]
    overall_precision = precisions.mean()
    overall_map = avg_precisions.mean()
    system_scores[system_name] = {'precision@10': overall_precision, 'MAP': overall_map}

# Convert the system scores dictionary into a dataframe
all_system_scores = pd.DataFrame(system_scores).T

# Print the scores for all systems
print(all_system_scores)


  avg_precisions = grouped.apply(lambda x: np.sum(x['relevance'] * np.cumsum(x['relevance']) / np.arange(1, len(x['relevance']) + 1)) / np.sum(x['relevance']))


                  MAP  precision@10
input\input  0.315698         0.478


In [9]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, ttest_rel

# Load the qrels file
qrels_file_path = 'qrels.trec8.adhoc.csv'
qrels_df = pd.read_csv(qrels_file_path, sep=' ', header=None, names=['query_id', 'ignore', 'doc_id', 'relevance'])

# Get a list of all input files in the input directory
input_files = glob.glob('input/*.csv')

# Create an empty dictionary to store the system scores
system_scores = {}

# Loop through each input file
for filename in input_files:
    # Load the input file
    input_df = pd.read_csv(filename, sep='\t', header=None, names=['query_id', 'ignore', 'doc_id', 'rank', 'score', 'ignore2'])

    # Merge the input file with the qrels to get the relevance scores
    merged_df = pd.merge(input_df, qrels_df, on=['query_id', 'doc_id'], how='left')

    # Compute precision@10 and MAP per query
    grouped = merged_df.groupby('query_id')
    precisions = grouped.apply(lambda x: np.sum(x['relevance'].iloc[:10]) / 10)
    avg_precisions = grouped.apply(lambda x: np.sum(x['relevance'] * np.cumsum(x['relevance']) / np.arange(1, len(x['relevance']) + 1)) / np.sum(x['relevance']))

    # Compute the overall precision@10 and MAP scores for this system
    system_name = filename.split('/')[-1].split('.')[0]
    overall_precision = precisions.mean()
    overall_map = avg_precisions.mean()
    system_scores[system_name] = {'precision@10': overall_precision, 'MAP': overall_map}

# Convert the system scores dictionary into a dataframe
all_system_scores = pd.DataFrame(system_scores).T

# Compute correlation coefficient between precision@10 and MAP
corr_coef, p_value = pearsonr(all_system_scores['precision@10'], all_system_scores['MAP'])
print(f'Correlation coefficient: {corr_coef:.2f}, p-value: {p_value:.2f}')

# Compute significance testing using paired t-test
p_value = ttest_rel(all_system_scores['precision@10'], all_system_scores['MAP'])[1]
print(f'Paired t-test p-value: {p_value:.2f}')


  avg_precisions = grouped.apply(lambda x: np.sum(x['relevance'] * np.cumsum(x['relevance']) / np.arange(1, len(x['relevance']) + 1)) / np.sum(x['relevance']))


ValueError: x and y must have length at least 2.