In [20]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

In [24]:
def stem_text(text, ps):
    """Tokenize ``text`` and return the stem of every token, joined by single spaces.

    Args:
        text: Raw text to process.
        ps: Object exposing ``stem(word)``; called once per token, in order.

    Returns:
        A single string containing the stemmed tokens separated by ``' '``.
    """
    tokens = word_tokenize(text)
    return ' '.join(map(ps.stem, tokens))

In [66]:
# Maps DOCNO -> stemmed document text; filled in by parse_file().
# NOTE: re-running this cell rebinds text_map to a new, empty dict.
text_map = {}

# Single stemmer instance shared by every parse_file() call.
ps = PorterStemmer()

def _store_document(docno, text_lines):
    """Stem the collected lines and record them in ``text_map`` under ``docno``.

    Nothing is stored when the document has no DOCNO or no text lines.
    """
    if docno and text_lines:
        # Join with a space so the last word of one line and the first word
        # of the next remain separate tokens.
        text_map[docno] = stem_text(' '.join(text_lines), ps)


def parse_file(file_path, encoding=None):
    """Parse one collection file and store each document's stemmed text in ``text_map``.

    The file is scanned line by line. A ``<DOCNO>...</DOCNO>`` line names the
    current document, and every line between a ``<TEXT>`` line and the next
    ``</TEXT>`` line belongs to its body. All ``<TEXT>`` sections seen before
    the document ends are combined into one text. A document ends at
    ``</DOC>``, at the next ``<DOCNO>``, or at end of file; documents without
    a DOCNO or without any text are not stored.

    Args:
        file_path: Path of the file to parse.
        encoding: Text encoding handed to ``open()``. ``None`` (the default)
            keeps the platform default encoding.
            NOTE(review): pass an explicit encoding if the collection is not
            valid in the platform default - confirm which one it uses.

    Side effects:
        Writes ``text_map[docno] = stem_text(text, ps)`` for every stored
        document, using the module-level ``text_map`` and ``ps``.
    """
    with open(file_path, 'r', encoding=encoding) as file_object:
        current_docno = None
        text_lines = []
        text_body = False

        for line in file_object:
            docno_match = re.search(r'<DOCNO>(.*?)</DOCNO>', line)
            if docno_match:
                # A new DOCNO begins a new document: flush whatever the
                # previous one collected (covers input without </DOC>).
                _store_document(current_docno, text_lines)
                current_docno = docno_match.group(1).strip()
                text_lines = []
                text_body = False

            # Tag lines only switch state; they contribute no text themselves.
            if '<TEXT>' in line:
                text_body = True
                continue
            if '</TEXT>' in line:
                # Do not store yet: the same document may contain further
                # <TEXT> sections that belong to it.
                text_body = False
                continue
            if '</DOC>' in line:
                _store_document(current_docno, text_lines)
                # Reset for the next document
                current_docno = None
                text_lines = []
                text_body = False
                continue

            if text_body:
                stripped = line.strip()
                if stripped:
                    text_lines.append(stripped)

        # Flush a document that was still open when the file ended.
        _store_document(current_docno, text_lines)
            
    

In [67]:
# Directory holding the raw collection files (named once instead of twice).
collection_dir = 'AP_DATA/ap89_collection'

# os.listdir() returns entries in arbitrary order; sorting makes the parse
# order (and therefore text_map's insertion order) reproducible across runs.
for filename in sorted(os.listdir(collection_dir)):
    file_path = os.path.join(collection_dir, filename)
    # Skip sub-directories and other non-regular entries, which open() cannot read.
    if os.path.isfile(file_path):
        parse_file(file_path)



In [68]:
# Number of documents currently stored in text_map.
len(text_map)

84676

In [28]:
# Snapshot of every document id, in text_map's insertion order.
docnos = list(text_map)
# Show the first id as a quick sanity check.
docnos[0]

'AP890101-0001'

In [30]:
sw_path = 'config/stoplist.txt'

# One stop-list entry per line of the file, with surrounding whitespace removed.
with open(sw_path) as file:
    stopwords = list(map(str.strip, file))
    

In [31]:
# Number of stop-list entries loaded (one per line of the file, blank lines included).
len(stopwords)

418

In [37]:
import string
def process_content(text):
    """Remove stop words and punctuation-only tokens from ``text``.

    The text is tokenized with ``word_tokenize``. A token is dropped when its
    lowercase form appears in the module-level ``stopwords`` collection, or
    when every one of its characters is in ``string.punctuation``. The second
    rule also removes multi-character punctuation tokens (for example ``--``
    or ``...``), which a plain substring test against ``string.punctuation``
    would let through.

    Args:
        text: Text to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Sets give constant-time membership tests; building them once per call
    # is cheaper than scanning the stop list for every token.
    stopword_set = set(stopwords)
    punctuation_chars = set(string.punctuation)

    kept_words = []
    for word in word_tokenize(text):
        if word.lower() in stopword_set:
            continue
        if all(char in punctuation_chars for char in word):
            continue
        kept_words.append(word)

    return ' '.join(kept_words)

In [38]:
# Replace every stored text with its cleaned form. Only existing keys are
# reassigned, so the dict keeps its size while it is being iterated.
for key, val in text_map.items():
    text_map[key] = process_content(val)

In [39]:
# Document count after cleaning; the cleaning loop only reassigns existing
# keys, so it does not change this number.
len(text_map)

84678

In [43]:
from elasticsearch import Elasticsearch

In [69]:
# Client for the Elasticsearch node at localhost:9200.
es = Elasticsearch("http://localhost:9200")
# Connectivity check: print what ping() reports before any index work is attempted.
print(es.ping())

True


In [70]:
index_name = "ap89_data1"

# Stop-word token filter whose word list is named by "stopwords_path".
# NOTE(review): the path is presumably resolved by Elasticsearch relative to
# its own config directory - confirm.
_english_stop_filter = {
    "type": "stop",
    "stopwords_path": "my_stoplist.txt",
}

# Custom analyzer: standard tokenizer, then lowercase, then the stop filter.
_stopped_analyzer = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["lowercase", "english_stop"],
}

_index_settings = {
    "number_of_shards": 1,
    "number_of_replicas": 1,
    "analysis": {
        "filter": {"english_stop": _english_stop_filter},
        "analyzer": {"stopped": _stopped_analyzer},
    },
}

# A single text field, "content", analyzed with the "stopped" analyzer.
_index_mappings = {
    "properties": {
        "content": {
            "type": "text",
            "fielddata": True,
            "analyzer": "stopped",
            "index_options": "positions",
        },
    },
}

# Request body used when the index is created.
configurations = {
    "settings": _index_settings,
    "mappings": _index_mappings,
}

In [71]:
# Create the index using the settings and mappings held in `configurations`.
# NOTE(review): presumably fails if the index already exists, so this cell
# may not be re-runnable as-is - confirm.
es.indices.create(index=index_name, body=configurations)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ap89_data1'}

In [72]:
def add_data(_id, text):
    """Index ``text`` as the ``content`` field of document ``_id`` in ``index_name``.

    Args:
        _id: Identifier under which the document is indexed.
        text: Value stored in the document's ``content`` field.
    """
    document = {'content': text}
    es.index(index=index_name, id=_id, body=document)

In [None]:
# Imported here because only this cell needs the bulk helper.
from elasticsearch.helpers import bulk

# One bulk action per document: the DOCNO becomes the document id and the
# cleaned text becomes its "content" field.
actions = (
    {"_index": index_name, "_id": docno, "_source": {"content": text}}
    for docno, text in text_map.items()
)

# bulk() sends the actions in batches rather than issuing one HTTP request
# per document; by default it raises if any document fails to index.
indexed_count, _ = bulk(es, actions)

print("All documents have been added to the index")