Used to search for files that match a specified pattern.

In [1]:
!pip install pandas
import pandas as pd
import numpy as np
import glob



PW Cleaning

In [2]:
# Define a function to clean the data in a TREC input file
def clean_trec_input_file(filename):
    # Load the data into a pandas dataframe
    df = pd.read_csv(filename, sep='\t', header=None, names=['query_id', 'ignore', 'doc_id', 'rank', 'score', 'ignore2'])

    # Drop any columns that aren't needed
    df.drop(['ignore', 'ignore2'], axis=1, inplace=True)

    # Convert the rank and score columns to numeric data types
    df['rank'] = pd.to_numeric(df['rank'], errors='coerce')
    df['score'] = pd.to_numeric(df['score'], errors='coerce')

    # Drop any rows with missing values
    df.dropna(inplace=True)

    # Reset the index
    df.reset_index(drop=True, inplace=True)

    # Return the cleaned dataframe
    return df

In [3]:
# Define a function to concatenate the cleaned data from multiple TREC input files
def concat_clean_trec_input_files(filenames):
    # Initialize an empty list to store the cleaned dataframes
    dfs = []

    # Loop through each input file
    for filename in filenames:
        # Clean the data in the file
        df = clean_trec_input_file(filename)

        # Append the cleaned dataframe to the list
        dfs.append(df)

    # Concatenate the cleaned dataframes
    concatenated_df = pd.concat(dfs)

    # Reset the index
    concatenated_df.reset_index(drop=True, inplace=True)

    # Return the concatenated dataframe
    return concatenated_df

In [4]:
# Get a list of all CSV files in the input directory
input_files = glob.glob('input/*.csv')

print(input_files)

['input\\input.acsys8aln2.csv', 'input\\input.apl8c621.csv', 'input\\input.att99ate.csv', 'input\\input.disco1.csv', 'input\\input.Flab8atd2..csv', 'input\\input.GE8ATD3..csv', 'input\\input.ibms99a.csv', 'input\\input.INQ601.csv', 'input\\input.isa50t.csv', 'input\\input.mds08a3.csv', 'input\\input.MITSLStd.csv', 'input\\input.ok8alx.csv', 'input\\input.pir9Atd0.csv', 'input\\input.READWARE.csv', 'input\\input.Sab8A1.csv']


In [5]:
# Concatenate and clean the data in the input files
cleaned_data = concat_clean_trec_input_files(input_files)

cleaned_data.head()
print(cleaned_data.shape)

(689266, 4)


In [6]:
# Load the qrels file
qrels_file_path = 'qrels.trec8.adhoc.csv'
qrels_df = pd.read_csv(qrels_file_path, sep=' ', header=None, names=['query_id', 'ignore', 'doc_id', 'relevance'])

#show qrels_dataframe
qrels_df.head()
print(qrels_df.columns)

Index(['query_id', 'ignore', 'doc_id', 'relevance'], dtype='object')


Xiao Hei Cleaning


In [64]:
# Load the qrels file
qrels_file_path = 'qrels.trec8.adhoc.csv'
qrels_df = pd.read_csv(qrels_file_path, sep=' ', header=None, names=['query_id', 'ignore', 'doc_id', 'relevance'])

#show qrels_dataframe
qrels_df.head()
print(qrels_df.columns)

Index(['query_id', 'ignore', 'doc_id', 'relevance'], dtype='object')


In [65]:
# Get a list of all CSV files in the input directory
input_files = glob.glob('input/*.csv')

# Create an empty list to store the input files dataframes
input_dfs = []

# Loop through each input CSV file
for filename in input_files:
    # Read the input CSV file into a dataframe
    df = pd.read_csv(filename, sep='\t', header=None)
    # Append the dataframe to the list of dataframes
    input_dfs.append(df)
    
raw_data = pd.concat(input_dfs, ignore_index=True)
print('Shape of raw_data:', raw_data.shape)

############################################## Data cleaning ##############################################

# Loop through each input dataframe and filter out rows where does not meet the clean up requirement
# 1. data_id  is not present in qrels_data_ids
# 2. relevant_score equal to 0
# 3. duplicate ranking

# Extract the valid values from the third column of qrels_df
qrels_data_ids = set(qrels_df.iloc[:, 2])

cleaned_dfs = []

for df in input_dfs:
    # Get rows where the data id is present in the qrels data ids
    df_with_present_data_id = df.iloc[:, 2].isin(qrels_data_ids)
    
    # Get rows where relevant_score does not equal 0
    df_with_relevant_score_not_zero = df.iloc[:, 4] != 0
    
    # Combine targeted rows for first and second rule
    cleaned_df = df[df_with_present_data_id & df_with_relevant_score_not_zero]

    # Get rows where ranking is not duplicated
    cleaned_df = cleaned_df.drop_duplicates(subset=[4])
    cleaned_dfs.append(cleaned_df)

# Concatenate all cleaned dataframes into a single dataframe
cleaned_data = pd.concat(cleaned_dfs, ignore_index=True)

# add column names to the cleaned dataframe
cleaned_data.columns = ['query_id', 'ignore', 'doc_id', 'rank', 'score', 'ignore2']

# Print the shape of the cleaned data
print('Shape of cleaned_data:', cleaned_data.shape)

Shape of raw_data: (689266, 6)
Shape of cleaned_data: (290603, 6)


In [74]:
print(cleaned_data.columns)
cleaned_data.head()
print(cleaned_data.shape)

Index(['query_id', 'ignore', 'doc_id', 'rank', 'score', 'ignore2'], dtype='object')
(290603, 6)


Data Analysis Part 

In [7]:
# Create an empty dictionary to store the system scores
system_scores = {}

# Merge the input file with the qrels to get the relevance scores
merged_df = pd.merge(cleaned_data, qrels_df, on=['query_id', 'doc_id'], how='left')

merged_df.head()

Unnamed: 0,query_id,doc_id,rank,score,ignore,relevance
0,401,FT924-5091,0,10015.175781,0.0,1.0
1,401,FBIS3-39240,1,10014.111328,0.0,1.0
2,401,FT924-4470,2,10013.638672,0.0,1.0
3,401,FBIS4-18182,3,10013.591797,0.0,0.0
4,401,FBIS3-59055,4,10013.441406,0.0,1.0


In [8]:
# Compute precision@10 and MAP per query
grouped = merged_df.groupby('query_id')
precisions = grouped.apply(lambda x: np.sum(x['relevance'].iloc[:10]) / 10)
avg_precisions = grouped.apply(lambda x: np.sum(x['relevance'] * np.cumsum(x['relevance']) / np.arange(1, len(x['relevance']) + 1)) / np.sum(x['relevance']))

In [9]:
merged_df.columns

Index(['query_id', 'doc_id', 'rank', 'score', 'ignore', 'relevance'], dtype='object')

In [10]:
# Create a new DataFrame with the query_id as the index and the precision@10 and MAP as columns
results_df = pd.DataFrame({'precision@10': precisions, 'MAP': avg_precisions}, index=merged_df['query_id'].unique())

# Save the results to a CSV file
results_df.to_csv('results.csv')

results_df.head()

Unnamed: 0,precision@10,MAP
401,0.7,0.1292
402,1.0,0.094294
403,1.0,0.086817
404,0.4,0.12807
405,0.2,0.037844


In [11]:
# Compute the mean precision@10 and mean MAP across all queries
mean_precision = results_df['precision@10'].mean()
mean_map = results_df['MAP'].mean()

# Print the results
print(f"Overall precision@10: {mean_precision:.4f}")
print(f"Overall MAP: {mean_map:.4f}")

Overall precision@10: 0.4780
Overall MAP: 0.0804
