In [1]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import math
import numpy as np

In [331]:
def stem_text(text, ps):
    words = word_tokenize(text)
    stemmed_words = [ps.stem(word) for word in words]
    return ' '.join(stemmed_words)

In [332]:
text_map = {}

ps = PorterStemmer()

def parse_file(file_path):
    with open(file_path, 'r') as file_object:
        current_docno = None
        current_text = ""
        text_body = False
        for line in file_object:
            docno_match = re.search(r'<DOCNO>(.*?)</DOCNO>', line)
            if docno_match:
                current_docno = docno_match.group(1).strip()

            # Find TEXT
            text_match_start = re.search(r'<TEXT>', line)
            text_match_end = re.search(r'</TEXT>', line)
            if text_match_start:
                text_body = True
                continue
            elif text_match_end:
                text_body = False
                
            if text_body:
                current_text+= line.strip()
            
            # Check if both DOCNO and TEXT are found
            if re.search(r'</DOC>', line):
                text_map[current_docno] = stem_text(current_text,ps)
                # Reset for the next document
                current_text = ""
            
    

In [333]:
for filename in os.listdir('AP_DATA/ap89_collection'):
    file_path = os.path.join('AP_DATA/ap89_collection', filename)
    parse_file(file_path)



In [334]:
len(text_map)

84678

In [335]:
sw_path = 'stoplist.txt'

with open(sw_path) as file:
    stopwords = [line.strip() for line in file]
    

In [336]:
len(stopwords)

495

In [337]:
import string
def process_content(text):
    words = word_tokenize(text)
    
    filtered_words = [word for word in words if word.lower() not in stopwords]

    filtered_words = [word for word in filtered_words if (word not in string.punctuation)]
    
    clean_text = ' '.join(filtered_words)
    
    return clean_text

In [338]:
for key,val in zip(text_map.keys(), text_map.values()):
    text_map[key] = process_content(val)

In [339]:
len(text_map)

84678

In [76]:
from elasticsearch7 import Elasticsearch

In [75]:
!pip install elasticsearch7

Collecting elasticsearch7
  Downloading elasticsearch7-7.17.9-py2.py3-none-any.whl (386 kB)
     ---------------------------------------- 0.0/386.4 kB ? eta -:--:--
     ----------- -------------------------- 112.6/386.4 kB 2.2 MB/s eta 0:00:01
     ------------------------------------ - 368.6/386.4 kB 3.8 MB/s eta 0:00:01
     -------------------------------------- 386.4/386.4 kB 3.4 MB/s eta 0:00:00
Installing collected packages: elasticsearch7
Successfully installed elasticsearch7-7.17.9


In [79]:
es = Elasticsearch("http://localhost:9200")
print(es.ping())

True


In [340]:
index_name = "ap89_data_final"

configurations = {
    "settings" : {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": stopwords
                }
            },
            "analyzer": {
                "stopped": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_stop"
                    ]
                }
            }
      }
    },
    "mappings": {
        "properties": {
            "content": {
                "type": "text",
                "fielddata": True,
                "analyzer": "stopped",
                "index_options": "positions"
            }
        }
    }
}

In [96]:
es.indices.create(index=index_name, body=configurations)

  es.indices.create(index=index_name, body=configurations)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ap89_data'}

In [341]:
def add_data(_id, text):
    es.index(index=index_name, body={'content': text}, id=_id)

In [342]:
for key in text_map:
    add_data(key, text_map[key])
    
print("All documents have been added to the index")

  es.index(index=index_name, body={'content': text}, id=_id)


All documents have been added to the index


In [620]:
manual_query = {'85': 'corrupt offici government',
                '59': 'weather dead',
                '56' : 'prime lend rate',
                '71': 'incursion border military guerilla',
                '64': 'hostage',
                '62': "militari coup d'etat",
                '93': 'nation rifl associ nra',
                '99': 'development iran contra',
                '58': 'rail strike',
                '77': 'poach',
                '54': 'commerci launch satellit',
                '87': 'criminal current action bank failed officer',
                '94': 'computer crime illegal',
                '100': 'high-tech dual-use regulate transfer technolog',
                '89': 'invest opec member downstream oper',
                '61': 'israel iran contra',
                '95': 'computer crime solv law',
                '68': 'safeti installation employe fine-diamet insulation',
                '57': 'mci bell',
                '97': 'fiber optic',
                '98': 'fiber optic equip',
                '60': 'pay senior',
                '80': 'presidential bush dukakis',
                '63': 'translation',
                '91': 'weapon us army'
}

for q in manual_query.keys():
    manual_query[q] = process_content(stem_text(manual_query[q],ps))
    print(q,manual_query[q])

85 corrupt offici govern
59 weather dead
56 prime lend rate
71 incurs border militari guerilla
64 hostag
62 militari coup
93 nation rifl associ nra
99 develop iran contra
58 rail strike
77 poach
54 commerci launch satellit
87 crimin current action bank fail offic
94 comput crime illeg
100 high-tech dual-us regul transfer technolog
89 invest opec downstream oper
61 israel iran contra
95 comput crime solv law
68 safeti instal employ fine-diamet insul
57 mci bell
97 fiber optic
98 fiber optic equip
60 pay senior
80 presidenti bush dukaki
63 translat
91 weapon armi


In [343]:
def ES_search(query_num):
    
    query_string = manual_query[query_num]
    print(query_num, query_string)
    
#     search_query = {
#     "query": {
#         "match": {
#             "content": query_string
#         }
#     }
# }
    res_es_search = es.search(index='ap89_data_final', query={'match': {'content': query_string}}, size = 1000)
    return res_es_search

In [608]:
##Get scores for ES built in
with open('query_result_es_builtin.txt','w') as f:
        f.write("")
for num in manual_query.keys():
    res = ES_search(num)['hits']['hits']
    with open('query_result_es_builtin.txt','a') as f:
        for i,hit in enumerate(res):  
            res_string = num + " " + 'Q0' + " " + hit['_id'] + " " + str(i+1) + " " + str(hit['_score']) + " " + "Exp" + '\n'
            f.write(res_string)

85 corrupt offici govern
59 weather dead
56 prime lend rate
71 incurs border militari guerilla
64 hostag
62 militari coup
93 nation rifl associ nra
99 develop iran contra
58 rail strike
77 poach
54 commerci launch satellit
87 crimin current action bank fail offic
94 comput crime illeg
100 high-tech dual-us regul transfer technolog
89 invest opec downstream oper
61 israel iran contra
95 comput crime solv law
68 safeti instal employ fine-diamet insul
57 mci bell
97 fiber optic
98 fiber optic equip
60 pay senior
80 presidenti bush dukaki
63 translat
91 weapon armi


In [345]:
vector_map = {}

def get_term_vectors(doc_id):
    term_vector_request = {
        "index": "ap89_data_final",
        "id" : doc_id,
        "doc_type": "_doc",
        "fields": ["content"],
        "term_statistics": True}
    vector_map[doc_id] = es.termvectors(**term_vector_request)['term_vectors']
        
        

In [346]:
for doc in text_map.keys():
    get_term_vectors(doc)

In [347]:
len(vector_map)

84678

In [348]:
def get_avg_doc_len():
    total_words = vector_map['AP890306-0069']['content']['field_statistics']['sum_ttf']
    return total_words / 84678    

In [349]:
avg_doc_len = get_avg_doc_len()

In [350]:
print(avg_doc_len)

251.69524551831645


In [382]:
def get_term_freq(term, doc):
    if vector_map[doc] == {}:
        return 0
    terms = vector_map[doc]['content']['terms']
    
    if term in terms.keys():
        return terms[term]['term_freq']
    else:
        return 0
def get_doc_len(doc):
    if vector_map[doc] == {}:
        return 0
    doc_terms = vector_map[doc]['content']['terms']
    doc_len = 0
    for t in doc_terms.keys():
        doc_len+= doc_terms[t]['term_freq']
    return doc_len

def get_dfw(term):
    
    terms = vector_map[doc]['content']['terms']
    
    query_body = {
    "query": {
        "match": {
            "content": term  
            }
        }
    }
    
    response = es.count(index="ap89_data_final", body=query_body)
    return response['count']


def get_cfw(term, doc):
    if vector_map[doc] == {}:
        return 0
    terms = vector_map[doc]['content']['terms']
    if term in terms.keys():
        return terms[term]['ttf']
    else:
        return 0

In [352]:
vocab_size = set()


for v_key in vector_map.keys():
    if vector_map[v_key] == {}:
        continue
    vocab_size.update(set(vector_map[v_key]['content']['terms'].keys()))

In [353]:
V = len(vocab_size)

In [354]:
print(V)

1409243


In [602]:
def okapi_tf(query_key):
    scores = []
    query_list = manual_query[query_key].split()
    for doc in text_map.keys():
        doc_len = get_doc_len(doc)
        total_score = 0
        for word in query_list:
            tf_wd = get_term_freq(word,doc)
            score = tf_wd / (tf_wd + 0.5 + 1.5*(doc_len / avg_doc_len))
            total_score+=score
        if total_score > 0:    
            scores.append((doc, total_score))
    return scores

In [603]:
def tf_idf(query_key):
    scores = []
    query_list = manual_query[query_key].split()
    dfw_list = [get_dfw(word) for word in query_list]
    for doc in text_map.keys():
        doc_len = get_doc_len(doc)
        total_score = 0
        for word,dfw in zip(query_list,dfw_list):
            tf_wd = get_term_freq(word,doc)
            score = (tf_wd / (tf_wd + 0.5 + 1.5*(doc_len / avg_doc_len))) * math.log(84678/dfw)
            total_score+=score
        if total_score > 0:    
            scores.append((doc, total_score))
    return scores

In [604]:
def okapi_bm25(query_key):
    k1 = 1.2
    b = 0.75
    k2 = 500
    scores = []
    query_list = manual_query[query_key].split()
    dfw_list = [get_dfw(word) for word in query_list]
    for doc in text_map.keys():
        doc_len = get_doc_len(doc)
        total_score = 0
        for word, dfw in zip(query_list,dfw_list):
            tf_wd = get_term_freq(word,doc)
            tf_wq = query_list.count(word)
            first_term = math.log((84678 + 0.5) / (dfw+0.5))
           
            second_term = (tf_wd + k1*tf_wd) / (tf_wd + k1*((1-b) + b*doc_len/avg_doc_len))
           
            third_term = (tf_wq + k2*tf_wq) / (tf_wq + k2)
            
            score = first_term * second_term * third_term
            total_score+=score
        if total_score > 0:    
            scores.append((doc, total_score))
    return scores

In [366]:
def lm_laplace(query_key):
    scores = []
    query_list = manual_query[query_key].split()
    for doc in text_map.keys():
        doc_len = get_doc_len(doc)
        total_score = 0
        for word in query_list:
            tf_wd = get_term_freq(word,doc)
            score = math.log((tf_wd+1) / (doc_len + V))
            total_score+=score
        if total_score != 0:    
            scores.append((doc, total_score))
    return scores

In [625]:
def lm_jm(query_key):
    l = 0.7
    scores = []
    query_list = manual_query[query_key].split()
    for doc in text_map.keys():
        doc_len = get_doc_len(doc)
        total_score = 0
        for word in query_list:
            tf_wd = get_term_freq(word,doc)
            if tf_wd == 0:
                score = -10000
                total_score += score
            else:
                score = (l*(tf_wd/doc_len)) + ((1-l)*(get_cfw(word,doc) / V))
                total_score += math.log(score)
        if total_score != 0:    
            scores.append((doc, total_score))
    return scores

    

In [117]:
def process_model(model, query, filename):
    ##query_string = manual_query[query]
    results = model(query)
    #for doc in text_map.keys():
        #model_out = model(query,doc)
        #if model_out[1] > 0:
            #results.append(model_out)
            
            
    results.sort(key=lambda a: a[1], reverse=True)
    results = results[:1000]
    
    
    with open(filename,'a') as f:
        for i in range(len(results)):  
            res_string = query + " " + 'Q0' + " " + results[i][0] + " " + str(i+1) + " " + str(results[i][1]) + " " + "Exp" + '\n'
            f.write(res_string)
    
    
    
    

In [605]:
##OkapiTF
with open('query_result_okapitf.txt','w') as f:
        f.write("")
for query_num in manual_query.keys():
    print("Query num ", query_num)
    process_model(okapi_tf, query_num, 'query_result_okapitf.txt')

Query num  85
Query num  59
Query num  56
Query num  71
Query num  64
Query num  62
Query num  93
Query num  99
Query num  58
Query num  77
Query num  54
Query num  87
Query num  94
Query num  100
Query num  89
Query num  61
Query num  95
Query num  68
Query num  57
Query num  97
Query num  98
Query num  60
Query num  80
Query num  63
Query num  91


In [606]:
##TFIDF
with open('query_result_tfidf.txt','w') as f:
        f.write("")
for query_num in manual_query.keys():
    print("Query num ", query_num)
    process_model(tf_idf, query_num, 'query_result_tfidf.txt')

Query num  85
Query num  59
Query num  56
Query num  71
Query num  64
Query num  62
Query num  93
Query num  99
Query num  58
Query num  77
Query num  54
Query num  87
Query num  94
Query num  100
Query num  89
Query num  61
Query num  95
Query num  68
Query num  57
Query num  97
Query num  98
Query num  60
Query num  80
Query num  63
Query num  91


In [607]:
##BM25
with open('query_result_bm25.txt','w') as f:
        f.write("")
for query_num in manual_query.keys():
    print("Query num ", query_num)
    process_model(okapi_bm25, query_num, 'query_result_bm25.txt')

Query num  85
Query num  59
Query num  56
Query num  71
Query num  64
Query num  62
Query num  93
Query num  99
Query num  58
Query num  77
Query num  54
Query num  87
Query num  94
Query num  100
Query num  89
Query num  61
Query num  95
Query num  68
Query num  57
Query num  97
Query num  98
Query num  60
Query num  80
Query num  63
Query num  91


In [594]:
##LM Laplace
with open('query_result_lmlaplace.txt','w') as f:
        f.write("")
for query_num in manual_query.keys():
    print("Query num ", query_num)
    process_model(lm_laplace, query_num, 'query_result_lmlaplace.txt')

Query num  85
Query num  59
Query num  56
Query num  71
Query num  64
Query num  62
Query num  93
Query num  99
Query num  58
Query num  77
Query num  54
Query num  87
Query num  94
Query num  100
Query num  89
Query num  61
Query num  95
Query num  68
Query num  57
Query num  97
Query num  98
Query num  60
Query num  80
Query num  63
Query num  91


In [626]:
##LM JM
with open('query_result_lmjm.txt','w') as f:
        f.write("")
for query_num in manual_query.keys():
    print("Query num ", query_num)
    process_model(lm_jm, query_num, 'query_result_lmjm.txt')

Query num  85
Query num  59
Query num  56
Query num  71
Query num  64
Query num  62
Query num  93
Query num  99
Query num  58
Query num  77
Query num  54
Query num  87
Query num  94
Query num  100
Query num  89
Query num  61
Query num  95
Query num  68
Query num  57
Query num  97
Query num  98
Query num  60
Query num  80
Query num  63
Query num  91
