In [None]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import math
import numpy as np

In [54]:
def stem_text(text, ps):
    words = word_tokenize(text)
    stemmed_words = [ps.stem(word) for word in words]
    return ' '.join(stemmed_words)

In [55]:
text_map = {}

ps = PorterStemmer()

def parse_file(file_path):
    with open(file_path, 'r') as file_object:
        current_docno = None
        current_text = ""
        text_body = False
        for line in file_object:
            docno_match = re.search(r'<DOCNO>(.*?)</DOCNO>', line)
            if docno_match:
                current_docno = docno_match.group(1).strip()

            # Find TEXT
            text_match_start = re.search(r'<TEXT>', line)
            text_match_end = re.search(r'</TEXT>', line)
            if text_match_start:
                text_body = True
                continue
            elif text_match_end:
                text_body = False
                
            if text_body:
                current_text+= line.strip()
            
            # Check if both DOCNO and TEXT are found
            if current_docno and current_text != "" and text_body == False:
                text_map[current_docno] = stem_text(current_text,ps)
                # Reset for the next document
                current_docno = None
                current_text = ""
            
    

In [56]:
for filename in os.listdir('AP_DATA/ap89_collection'):
    file_path = os.path.join('AP_DATA/ap89_collection', filename)
    parse_file(file_path)



In [57]:
len(text_map)

84676

In [28]:
docnos = list(text_map.keys())
docnos[0]

'AP890101-0001'

In [30]:
sw_path = 'config/stoplist.txt'

with open(sw_path) as file:
    stopwords = [line.strip() for line in file]
    

In [31]:
len(stopwords)

418

In [37]:
import string
def process_content(text):
    words = word_tokenize(text)

    filtered_words = [word for word in words if word.lower() not in stopwords]

    filtered_words = [word for word in filtered_words if word not in string.punctuation]

    clean_text = ' '.join(filtered_words)

    return clean_text

In [38]:
for key,val in zip(text_map.keys(), text_map.values()):
    text_map[key] = process_content(val)

In [39]:
len(text_map)

84678

In [10]:
from elasticsearch import Elasticsearch

In [44]:
es = Elasticsearch("http://localhost:9200")
print(es.ping())

True


In [70]:
index_name = "ap89_data1"

configurations = {
    "settings" : {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords_path": "my_stoplist.txt"
                }
            },
            "analyzer": {
                "stopped": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_stop"
                    ]
                }
            }
      }
    },
    "mappings": {
        "properties": {
            "content": {
                "type": "text",
                "fielddata": True,
                "analyzer": "stopped",
                "index_options": "positions"
            }
        }
    }
}

In [71]:
es.indices.create(index=index_name, body=configurations)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ap89_data1'}

In [72]:
def add_data(_id, text):
    es.index(index=index_name, body={'content': text}, id=_id)

In [73]:
for key in text_map:
    add_data(key, text_map[key])
    
print("All documents have been added to the index")

All documents have been added to the index


In [1]:
manual_query = {'85': 'alleg corrupt public offici government jurisdict',
                '59': 'weather caus fatal',
                '71': 'prime lend rate',
                '64': 'hostage',
                '62': "militari coup d'etat",
                '93': 'nation rifl associ nra',
                '99': 'iran contra',
                '58': 'rail strike',
                '77': 'poach wildlif',
                '54': 'contract agreement reserv launch commerci satellit',
                '87': 'current crimin action offic fail u.s financi institut',
                '94': 'crime comput',
                '100': 'communist industri state regul transfer high tech good technolog',
                '89': 'invest opec member state downstream oper',
                '61': 'israel iran contra',
                '95': 'comput crime solv',
                '68': 'studi concern safeti manufactur employe instal worker fine diamet fiber insul',
                '57': 'mci bell',
                '97': 'instanc fiber optic technolog',
                '98': 'fiber optic equip',
                '60': 'controversi standard perform determin salari level incent pay contrast determin basi senior longev job',
                '80': '1988 presidenti',
                '63': 'machin translat',
                '91': 'acquisit weapon'
}

In [45]:
def ES_search(query):
    
    search_query = {
    "query": {
        "match": {
            "content": query
        }
    }
}
    res_es_search = es.search(index='ap89_data1', body=search_query, size=1000)
    return res_es_search

In [296]:
for query in manual_query.keys():
    res = ES_search(manual_query[query])['hits']['hits'][:1000]
    with open('query_result_es_builtin.txt','a') as f:
        for i,hit in enumerate(res):  
            res_string = query + " " + 'Q0' + " " + hit['_id'] + " " + str(i) + " " + str(hit['_score']) + " " + "Exp" + '\n'
            f.write(res_string)

In [58]:
vector_map = {}

def get_term_vectors(doc_id):
    term_vector_request = {
        "index": "ap89_data1",
        "id" : doc_id,
        "doc_type": "_doc",
        "fields": ["content"],
        "term_statistics": True}
    vector_map[doc_id] = es.termvectors(**term_vector_request)['term_vectors']
        
        

In [85]:
def get_avg_doc_len():
    total_words = vector_map['AP890306-0069']['content']['field_statistics']['sum_ttf']
    return total_words / 84675    

In [87]:
avg_doc_len = get_avg_doc_len()

In [254]:
query_term_freqs = {}
for doc in text_map.keys():
    query_term_freqs[doc] = {}
    for query in manual_query.keys():
        query_term_freqs[doc][query] = []
        query_term_dfw[doc][query] = []
        for word in manual_query[query].split():
            query_term_freqs[doc][query].append(get_term_freq(word,doc))

In [300]:
def get_term_freq(term, doc):
    if vector_map[doc] == {}:
        return 0
    terms = vector_map[doc]['content']['terms']
    ##print(terms)
    if term in terms.keys():
        return terms[term]['term_freq']
    else:
        return 0
def get_doc_len(doc):
    if vector_map[doc] == {}:
        return 0
    doc_terms = vector_map[doc]['content']['terms']
    doc_len = 0
    for t in doc_terms.keys():
        doc_len+= doc_terms[t]['term_freq']
    return doc_len

def get_dfw(term, doc):
    if vector_map[doc] == {}:
        return 1
    terms = vector_map[doc]['content']['terms']
    if term in terms.keys():
        return terms[term]['term_freq']
    else:
        return 1

def get_vocab_size():
    vocab = []
    for doc in text_map.keys():
        doc_terms = vector_map[doc]['content']['terms']
        for term in doc_terms.keys():
            if term not in vocab:
                vocab.append(term)
                
    return len(vocab)

def get_cfw(term, doc):
    if vector_map[doc] == {}:
        return 1
    terms = vector_map[doc]['content']['terms']
    if term in terms.keys():
        return terms[term]['ttf']
    else:
        return 1

In [288]:
aggregation_request = {
        "aggs": {
            "vocabulary_size": {
                "cardinality": {
                    "field": 'content'
                }
            }
        }
    }

    # Perform the search request with the aggregation
search_request = {
       "query": {
         "match_all": {}
        },
        "size": 0,
        **aggregation_request
    }

    # Execute the search request
search_results = es.search(index='ap89_data1', body=search_request)

    # Extract the vocabulary size from the aggregation response
vocabulary_size = search_results['aggregations']['vocabulary_size']['value']
    
V = vocabulary_size

In [289]:
print(V)

1404886


In [257]:
def okapi_tf(query):
    scores = []
    query_list = manual_query[query].split()
    for doc in text_map.keys():
        doc_len = get_doc_len(doc)
        total_score = 0
        for word in query_list:
            tf_wd = get_term_freq(word,doc)
            score = tf_wd / (tf_wd + 0.5 + 1.5*(doc_len / avg_doc_len))
            total_score+=score
        if total_score > 0:    
            scores.append((doc, total_score))
    return scores

In [261]:
def tf_idf(query):
    scores = []
    query_list = manual_query[query].split()
    for doc in text_map.keys():
        doc_len = get_doc_len(doc)
        total_score = 0
        for word in query_list:
            tf_wd = get_term_freq(word,doc)
            score = (tf_wd / (tf_wd + 0.5 + 1.5*(doc_len / avg_doc_len))) * math.log(84676/get_dfw(word,doc))
            total_score+=score
        if total_score > 0:    
            scores.append((doc, total_score))
    return scores

In [274]:
def okapi_bm25(query):
    k1 = 1.2
    b = 0.75
    k2 = 100
    scores = []
    query_list = manual_query[query].split()
    for doc in text_map.keys():
        doc_len = get_doc_len(doc)
        total_score = 0
        for word in query_list:
            tf_wd = get_term_freq(word,doc)
            tf_wq = query_list.count(word)
            first_term = math.log((84676 + 0.5) / (get_dfw(word,doc)+0.5))
           
            second_term = (tf_wd + k1*tf_wd) / (tf_wd + k1*((1-b) + b*doc_len/avg_doc_len))
           
            third_term = (tf_wq + k2*tf_wq) / (tf_wq + k2)
            
            score = first_term * second_term * third_term
            total_score+=score
        if total_score > 0:    
            scores.append((doc, total_score))
    return scores

In [277]:
def lm_laplace(query):
    scores = []
    query_list = manual_query[query].split()
    for doc in text_map.keys():
        doc_len = get_doc_len(doc)
        terms = query_term_freqs[doc][query] 
        total_score = 0
        for word in query_list:
            tf_wd = get_term_freq(word,doc)
            score = math.log((tf_wd+1) / (doc_len + V))
            total_score+=score
        if total_score > 0:    
            scores.append((doc, total_score))
    return scores

In [264]:
def lm_jm(query):
    l = 0.5
    scores = []
    query_list = manual_query[query].split()
    for doc in text_map.keys():
        doc_len = get_doc_len(doc)
        query_list = manual_query[query].split() 
        total_score = 0
        for word in query_list:
            tf_wd = get_term_freq(word,doc)
            score = l*(tf_wd/doc_len) + (1-l)*(get_cfw(word,doc) / (V))
            total_score+=score
        if total_score > 0:    
            scores.append((doc, total_score))
    return scores

    

In [149]:
res_list = []
for doc in text_map.keys():
    ##print(okapi_tf(manual_query['85'], doc))
    res_list.append(okapi_tf(manual_query['85'], doc))
res_list.sort(reverse=True)
print(res_list[0])

2.455452538256885


In [137]:
def process_res(result):
    for rank, i  in enumerate(result['hits']['hits']):
        score = i['_score']
        doc_id = i['_id']
        print(score, doc_id, rank)

{}

In [227]:
def process_model(model, query, filename):
    ##query_string = manual_query[query]
    results = model(query)
    #for doc in text_map.keys():
        #model_out = model(query,doc)
        #if model_out[1] > 0:
            #results.append(model_out)
            
            
    results.sort(key=lambda a: a[1], reverse=True)
    results = results[:1000]
    
    with open(filename,'a') as f:
        for i in range(len(results)):  
            res_string = query + " " + 'Q0' + " " + results[i][0] + " " + str(i) + " " + str(results[i][1]) + " " + "Exp" + '\n'
            f.write(res_string)
    
    
    
    

In [305]:
##OkapiTF
for query_num in manual_query.keys():
    print("Query num ", query_num)
    process_model(okapi_tf, query_num, 'query_result_okapitf.txt')

Query num  85
Query num  59
Query num  71
Query num  64
Query num  62
Query num  93
Query num  99
Query num  58
Query num  77
Query num  54
Query num  87
Query num  94
Query num  100
Query num  89
Query num  61
Query num  95
Query num  68
Query num  57
Query num  97
Query num  98
Query num  60
Query num  80
Query num  63
Query num  91


In [304]:
##TFIDF
for query_num in manual_query.keys():
    print("Query num ", query_num)
    process_model(tf_idf, query_num, 'query_result_tfidf.txt')

Query num  85
Query num  59
Query num  71
Query num  64
Query num  62
Query num  93
Query num  99
Query num  58
Query num  77
Query num  54
Query num  87
Query num  94
Query num  100
Query num  89
Query num  61
Query num  95
Query num  68
Query num  57
Query num  97
Query num  98
Query num  60
Query num  80
Query num  63
Query num  91


In [303]:
##BM25
for query_num in manual_query.keys():
    print("Query num ", query_num)
    process_model(okapi_bm25, query_num, 'query_result_bm25.txt')

Query num  85
Query num  59
Query num  71
Query num  64
Query num  62
Query num  93
Query num  99
Query num  58
Query num  77
Query num  54
Query num  87
Query num  94
Query num  100
Query num  89
Query num  61
Query num  95
Query num  68
Query num  57
Query num  97
Query num  98
Query num  60
Query num  80
Query num  63
Query num  91


In [302]:
##LM Laplace
for query_num in manual_query.keys():
    print("Query num ", query_num)
    process_model(lm_laplace, query_num, 'query_result_lmlaplace.txt')

Query num  85
Query num  59
Query num  71
Query num  64
Query num  62
Query num  93
Query num  99
Query num  58
Query num  77
Query num  54
Query num  87
Query num  94
Query num  100
Query num  89
Query num  61
Query num  95
Query num  68
Query num  57
Query num  97
Query num  98
Query num  60
Query num  80
Query num  63
Query num  91


In [301]:
##LM JM
for query_num in manual_query.keys():
    print("Query num ", query_num)
    process_model(lm_jm, query_num, 'query_result_lmjm.txt')

Query num  85
Query num  59
Query num  71
Query num  64
Query num  62
Query num  93
Query num  99
Query num  58
Query num  77
Query num  54
Query num  87
Query num  94
Query num  100
Query num  89
Query num  61
Query num  95
Query num  68
Query num  57
Query num  97
Query num  98
Query num  60
Query num  80
Query num  63
Query num  91
