The `glob` module is used to search for files whose paths match a specified pattern.

In [None]:
!pip install pandas
import pandas as pd
import numpy as np
import glob

Xiao Hei Cleaning


In [None]:
# Load the qrels (relevance judgements) file: space-separated, no header row.
# The four columns are named here because the file itself carries no names.
qrels_file_path = 'qrels.trec8.adhoc.csv'
qrels_df = pd.read_csv(
    qrels_file_path,
    sep=' ',
    header=None,
    names=['query_id', 'ignore', 'doc_id', 'relevance'],
)

# Show the first rows and the column names. A bare `qrels_df.head()` is only
# rendered when it is the last expression of a cell, so print it explicitly.
print(qrels_df.head())
print(qrels_df.columns)

In [None]:
# Collect every CSV file in the input directory. glob returns paths in an
# arbitrary, filesystem-dependent order, so sort them to keep the position of
# each file in the lists below stable between executions.
input_files = sorted(glob.glob('input/*.csv'))
if not input_files:
    # Fail early with a clear message; pd.concat on an empty list would
    # otherwise raise a much less helpful error further down.
    raise FileNotFoundError("No input files found matching 'input/*.csv'")

# Read each tab-separated, header-less input file into its own dataframe
input_dfs = [
    pd.read_csv(filename, sep='\t', header=None) for filename in input_files
]

# Stack all input files into one dataframe, only to report the overall size
raw_data = pd.concat(input_dfs, ignore_index=True)
print('Shape of raw_data:', raw_data.shape)

############################################## Data cleaning ##############################################

# Planned clean-up rules (rows to filter out of each input dataframe):
# 1. data_id is not present in qrels_data_ids
# 2. relevant_score equal to 0
# 3. duplicate ranking
# NOTE: none of these rules is applied at the moment; every input dataframe
# is passed through unchanged.

# Document ids found in the third column of qrels_df (kept for rule 1 above)
qrels_data_ids = set(qrels_df.iloc[:, 2])

# "Cleaned" dataframes: currently the very same objects as in input_dfs
cleaned_dfs = list(input_dfs)

# Print the (unfiltered) dataframes themselves
print("this is input data:", cleaned_dfs)

In [None]:
# Descriptive names for the columns, listed in positional order
columns = ['query_id', 'ignore', 'doc_id', 'rank', 'score', 'ignore2']

# Map each integer column label (0, 1, 2, ...) to its descriptive name
position_to_name = dict(enumerate(columns))

# Build a renamed copy of every cleaned dataframe; labels that are not in the
# mapping are left as they are
cleaned_dfs_with_names = [
    frame.rename(columns=position_to_name) for frame in cleaned_dfs
]

In [None]:
# Print the whole list of dataframes held in cleaned_dfs_with_names for a visual check
print(cleaned_dfs_with_names)

In [None]:
# For every dataframe, print a 1-based header followed by its first and
# third columns (selected by position)
for i, cleaned_df in enumerate(cleaned_dfs_with_names):
    print(f"Columns of dataframe {i+1}:")
    for label, position in (("First column:", 0), ("Third column:", 2)):
        print(label, cleaned_df.iloc[:, position])

Data Analysis Part 

In [None]:
# Attach the qrels relevance value to every row of every dataframe
merge_keys = ['query_id', 'doc_id']
merged_df = []
for df in cleaned_dfs_with_names:
    # Left join: every row of df is kept; rows without a matching
    # (query_id, doc_id) pair in qrels_df get NaN in the qrels columns
    cleaned_df_with_qrel = df.merge(qrels_df, how='left', on=merge_keys)
    # Treat rows with no qrels match as relevance 0
    cleaned_df_with_qrel['relevance'] = cleaned_df_with_qrel['relevance'].fillna(0)
    print(cleaned_df_with_qrel.shape)
    merged_df.append(cleaned_df_with_qrel)

In [None]:
# Print the whole list of merged dataframes for a visual check
print(merged_df)

In [None]:
def _precision_at_10(group):
    """Return the sum of 'relevance' over the first 10 rows of *group*, divided by 10.

    Rows are taken in their existing order.
    NOTE(review): presumably each run is already sorted by rank - confirm.
    """
    return np.sum(group['relevance'].head(10)) / 10


def _average_precision(group):
    """Return the average precision of one query's rows, in their existing order.

    A group whose 'relevance' values sum to zero scores 0.0 instead of the
    NaN that the 0/0 division would otherwise produce.

    NOTE(review): the denominator is the relevance total of the rows present
    in this group, not the number of documents judged relevant in the qrels
    as the standard definition uses - confirm this is intended.
    """
    relevance = group['relevance']
    relevant_total = np.sum(relevance)
    if relevant_total == 0:
        return 0.0
    positions = np.arange(1, len(relevance) + 1)
    # Running relevance count divided by position, kept only at relevant rows
    precision_at_hits = relevance * np.cumsum(relevance) / positions
    return np.sum(precision_at_hits) / relevant_total


# Total number of rows with relevance == 1 across all dataframes. Counting via
# a boolean mask avoids the KeyError that value_counts()[1] raises when a
# dataframe has no such row.
temp = sum((df['relevance'] == 1).sum() for df in merged_df)

# One Series per dataframe, indexed by query_id. The metrics are computed in a
# single pass; the earlier duplicate pass was discarded by a list reset.
calculate_metrics_precisions = []
calculate_metrics_avg_precisions = []
for df in merged_df:
    # group by query_id
    grouped = df.groupby('query_id')

    # calculate precision for top 10
    calculate_metrics_precisions.append(grouped.apply(_precision_at_10))

    # calculate average precision
    calculate_metrics_avg_precisions.append(grouped.apply(_average_precision))


In [None]:
# Print both metric lists (one entry per dataframe in merged_df) for a visual check
print("calculate_metrics_precisions",calculate_metrics_precisions)
print("calculate_metrics_avg_pr",calculate_metrics_avg_precisions)

In [None]:
# Build the table from the list of per-dataframe Series: after the transpose
# each Series becomes one column and the Series index becomes the row index.
results_df = pd.DataFrame(calculate_metrics_precisions).T

# Name the columns after however many Series were actually computed; a
# hard-coded count raises a length mismatch for any other number of inputs.
results_df.columns = [f"Precision@10_{i}" for i in range(results_df.shape[1])]

# Per-row mean across all columns
overall_precision = results_df.mean(axis=1)

# add the new column to the DataFrame
results_df['Overall Precision@10'] = overall_precision

# Per-column mean across all rows (includes the column added just above)
overall_precision_row = results_df.mean(axis=0)

# add the new row to the DataFrame
results_df.loc["Overall"] = overall_precision_row

# show the new table and save it
print(results_df)
results_df.to_csv('resultsT10.csv')

In [None]:
# Build the table from the list of per-dataframe average-precision Series:
# after the transpose each Series becomes one column and the Series index
# becomes the row index.
results_df = pd.DataFrame(calculate_metrics_avg_precisions).T

# Name the columns after however many Series were actually computed; a
# hard-coded count raises a length mismatch for any other number of inputs.
results_df.columns = [f"MAP_{i}" for i in range(results_df.shape[1])]

# Per-row mean of the average precision across all columns
overall_precision = results_df.mean(axis=1)

# Add it as a summary column. The label previously read
# 'Overall Precision@10', copied from the precision table, which mislabelled
# this average-precision table and its CSV header.
results_df['Overall MAP'] = overall_precision

# Per-column mean across all rows (includes the column added just above)
overall_precision_row = results_df.mean(axis=0)

# add the new row to the DataFrame
results_df.loc["Overall"] = overall_precision_row

# show the new table and save it
print(results_df)
results_df.to_csv('resultsMAP.csv')