# Extraction

In [1]:
import os as os
import re

# The dataset files are looked up in the directory the process is running from.
directory = os.getcwd()

# Full paths of the three CISI dataset files, built in one pass.
cisi_all_path, cisi_qry_path, cisi_rel_path = (
    os.path.join(directory, dataset_file)
    for dataset_file in ('CISI.ALL', 'CISI.QRY', 'CISI.REL')
)

### cisi.all extraction

In [2]:
def _build_document(doc_id, fields):
    """Assemble the output dictionary for one record from its collected field lines."""
    year_found = re.search(r"\d{4}", " ".join(fields["Y"]))
    return {
        "document_id": doc_id,
        "title": " ".join(fields["T"]),
        "authors": " ".join(fields["A"]),
        # Collapse whitespace runs so wrapped lines read as one continuous text.
        "abstract": " ".join(" ".join(fields["W"]).split()),
        "source": " ".join(" ".join(fields["B"]).split()),
        # Callers receive an int when a four-digit year is present, else "".
        "year": int(year_found.group()) if year_found else "",
    }


def all_extraction(file_name_path):
    """Parse a CISI.ALL-style file into a list of document dictionaries.

    A record starts at a ``.I <id>`` line and runs until the next ``.I`` line
    or the end of the file.  Inside a record, marker lines select the field
    that the following text lines belong to:

    * ``.T`` title (any number of lines)
    * ``.A`` authors (the marker may be repeated, once per author)
    * ``.B`` source / publication information
    * ``.W`` abstract
    * ``.Y`` year
    * ``.X`` cross references, which are skipped

    Markers are only recognised at the start of a line, either alone or
    followed by whitespace and inline content (``.I 12``, ``.Y 1970``).
    Missing fields yield empty strings, so truncated or partial records do
    not raise.

    Args:
        file_name_path: Path of the file to read.

    Returns:
        A list with one dictionary per record, in file order, with the keys
        ``document_id`` (str), ``title``, ``authors``, ``abstract``,
        ``source`` (all str) and ``year`` (int, or ``""`` when absent).
    """
    marker = re.compile(r"^\.([ITABWXY])(?:\s+(.*))?$")

    with open(file_name_path) as f:
        file_lines = f.readlines()

    documents = []
    doc_id = None   # id of the record being read; None before the first ".I"
    fields = None   # tag letter -> text lines collected for the current record
    active = None   # tag letter currently receiving text; None means "skip"

    for raw_line in file_lines:
        found = marker.match(raw_line.rstrip())
        if found is None:
            text = raw_line.strip()
            if active is not None and text:
                fields[active].append(text)
            continue

        tag, inline = found.groups()
        if tag == "I":
            # A new record begins: flush the previous one first.  Bounding the
            # record here keeps a record without ".X" from absorbing the
            # records that follow it.
            if doc_id is not None:
                documents.append(_build_document(doc_id, fields))
            doc_id = inline.split()[0] if inline else ""
            fields = {"T": [], "A": [], "B": [], "W": [], "Y": []}
            active = None
        elif doc_id is not None:
            if tag == "X":
                # Cross-reference rows are not part of the extracted data.
                active = None
            else:
                active = tag
                if inline:
                    fields[tag].append(inline.strip())

    if doc_id is not None:
        documents.append(_build_document(doc_id, fields))

    return documents

In [3]:
# Parse the file at cisi_all_path into a list of per-document dictionaries.
documents = all_extraction(cisi_all_path)
#documents

### cisi.qry extraction

In [4]:
def qry_extraction(filename_path, tag):
    """Split a file's text into one chunk per occurrence of ``tag``.

    The file is read whole, every newline is replaced by a single space, and
    the resulting text is split with ``re.split`` on ``tag`` (a pattern string
    or compiled pattern).  Whatever precedes the first match is discarded.

    Returns:
        The list of chunks that follow each match, in file order.
    """
    with open(filename_path, 'r') as handle:
        flattened = handle.read().replace('\n', " ")
    # re.split always yields at least one element: the text before the first match.
    _preamble, *chunks = re.split(tag, flattened)
    return chunks

In [5]:
# Each query chunk is introduced by ".I" plus one following character.
# Raw string: '\.' in a normal literal is an invalid escape sequence and
# triggers a SyntaxWarning on recent Python versions; the pattern is unchanged.
qry_id_tag = re.compile(r'\.I.')
cisi_qry = qry_extraction(cisi_qry_path, qry_id_tag)

In [6]:
# extracting query id and query text
# Raw string: '\.' in a normal literal is an invalid escape sequence and
# triggers a SyntaxWarning on recent Python versions; the pattern is unchanged.
doc_id_tag = re.compile(r'\.I.')
cisi_qry = qry_extraction(cisi_qry_path, doc_id_tag)
queries = []
for d in cisi_qry:
    # Each chunk is "<id> <rest>"; split the id off at the first space.
    query_id, query_text = d.split(' ', 1)
    # Keep only what follows the first '.W' marker ...
    query_text = query_text.split('.W', 1)[1].strip()
    # ... and drop a '.B' section (and everything after it) if present.
    query_text = re.sub(r'\.B\s.*', '', query_text).strip()
    queries.append({'query_id': int(query_id), 'query_text': query_text})

In [16]:
#queries

In [8]:
# Build one {'query_id', 'publication'} record per query chunk.
# 'publication' is the text following the '.B' marker, or '' when no marker is found.
queries1 = []

for chunk in cisi_qry:
    id_part, body = chunk.split(' ', 1)
    found = re.search(r'\.B\s+(.*)', body)
    queries1.append({
        'query_id': int(id_part),
        'publication': found.group(1) if found else '',
    })

In [9]:
#queries1

In [10]:
# Build one {'query_id', 'author'} record per query chunk.
# 'author' is the text between the '.A' and '.W' markers, or '' when that
# pair is not found.
queries2 = []

for chunk in cisi_qry:
    id_part, body = chunk.split(' ', 1)
    found = re.search(r'\.A\s+(.*?)\s*\.W', body, re.DOTALL)
    if found is None:
        name = ''
    else:
        # Any newline inside the captured text becomes a ', ' separator.
        name = found.group(1).strip().replace('\n', ', ')
    queries2.append({'query_id': int(id_part), 'author': name})

In [11]:
#queries2

In [12]:
# Build one {'query_id', 'title'} record per query chunk.
# 'title' is the text between the '.T' and '.A' markers, or '' when that
# pair is not found.
queries3 = []

for chunk in cisi_qry:
    id_part, body = chunk.split(' ', 1)
    found = re.search(r'\.T\s+(.*?)\s*\.A', body, re.DOTALL)
    queries3.append({
        'query_id': int(id_part),
        'title': found.group(1).strip() if found else '',
    })

In [13]:
#queries3

In [14]:
# Merge lists of per-query dictionaries over the common key 'query_id'.
def merge_lists(x, y, z, w):
    """Merge four lists of dictionaries that share a ``'query_id'`` key.

    One output dictionary is produced per entry of ``x``, in the order of
    ``x``.  It starts as a copy of that entry (all of its keys, not only the
    id) and is then updated with every entry of ``y``, then ``z``, then ``w``
    whose ``'query_id'`` compares equal.  When keys clash, the later update
    wins.  The input dictionaries are not modified.

    Args:
        x: Driving list; its ids decide which entries appear in the result.
        y, z, w: Lists whose matching entries are merged into each ``x`` entry.

    Returns:
        A new list of merged dictionaries.

    Raises:
        KeyError: If an entry that gets inspected has no ``'query_id'`` key.
    """
    merged_list = []
    for dict_x in x:
        query_id = dict_x['query_id']
        # Copy the whole entry so the fields carried by x survive the merge
        # and the caller's dictionary is left untouched.
        dict_a = dict(dict_x)
        for other in (y, z, w):
            for candidate in other:
                if candidate['query_id'] == query_id:
                    dict_a.update(candidate)
        merged_list.append(dict_a)
    return merged_list

In [15]:
# Merge the four per-query lists on 'query_id'. queries1 (publication info) is
# passed first, so its ids determine which entries appear in the result.
# NOTE(review): 'publication' from queries1 reaches the output only if
# merge_lists copies the non-id keys of its first argument — confirm against
# its definition.
all_queries = merge_lists(queries1, queries3, queries2, queries)
#all_queries

### cisi.rel extraction

In [38]:
import pandas as pd

# Read the whitespace-separated relevance file into four named columns.
# The path comes from cisi_rel_path so this cell agrees with the other
# dataset paths; the separator is a raw string because '\s' in a normal
# literal is an invalid escape sequence.
rel_df = pd.read_csv(
    cisi_rel_path,
    names=['query_id', 'document_id', 'relevance', 'score'],
    delimiter=r'\s+',
)
# Sanity check: prints True if any cell of the table is null.
print(rel_df.isnull().values.any())

False


In [39]:
# Bare expression: as the last statement of a notebook cell it displays the DataFrame.
rel_df

Unnamed: 0,query_id,document_id,relevance,score
0,1,28,0,0.0
1,1,35,0,0.0
2,1,38,0,0.0
3,1,42,0,0.0
4,1,43,0,0.0
...,...,...,...,...
3109,111,422,0,0.0
3110,111,448,0,0.0
3111,111,485,0,0.0
3112,111,503,0,0.0
