In [1]:
from glob import glob
from nltk.tokenize import RegexpTokenizer
from elasticsearch import Elasticsearch
from bs4 import BeautifulSoup
import nltk
import random

In [2]:
# Lower-cased vocabulary taken from the NLTK `words` corpus; kept as a set so
# membership tests are cheap.
english_words = {entry.lower() for entry in nltk.corpus.words.words()}

# Elasticsearch client created with the constructor's default arguments.
es = Elasticsearch()

# Tokenizer that extracts runs of `\w` characters.
tokenizer = RegexpTokenizer(r'\w+')

In [3]:
# glob() returns matches in arbitrary order. Sorting keeps each path at a
# stable position across runs, which matters because the train/test split
# below is drawn by position in this list.
data_paths = sorted(glob("trec07p/data/*"))

In [4]:
# Maps a message file name (third "/"-separated component of the path that
# follows the label in the index file) to True when the label is "spam" and
# False for any other label.
spam_ham = {}
with open("trec07p/full/index") as index_file:
    # Iterate line by line instead of read().split("\n")[:-1]: the slice
    # dropped the final record whenever the file lacked a trailing newline.
    for raw_line in index_file:
        entry = raw_line.strip()
        if not entry:
            # Blank lines carry no record; indexing into them would raise.
            continue
        fields = entry.split(" ..")
        spam_ham[fields[1].split("/")[2]] = fields[0] == "spam"

            

In [5]:
# Sanity check: number of labelled file names loaded into spam_ham.
len(spam_ham)

75419

In [6]:
# Twenty percent of the number of message paths. True division makes this a
# float, so it has to be truncated before being used as a sample size.
twenty_perct = (20 * len(data_paths)) / 100

In [7]:
# Randomly pick int(twenty_perct) positions of data_paths as the test split.
# NOTE(review): no random seed is set in the visible cells, so the split
# differs on every run — consider seeding `random` in a config cell.
test_set = random.sample(range(len(data_paths)), int(twenty_perct))

# test_set stays a list (as before), but membership is checked against a set:
# `i in <list>` for every path is quadratic in the corpus size.
_test_positions = set(test_set)

# data_partition[path] is True for training messages and False for test ones.
data_partition = {
    path: position not in _test_positions
    for position, path in enumerate(data_paths)
}

In [8]:
# Token filter that emits word shingles of size 2 to 4 alongside the unigrams.
_custom_shingle_filter = {
    "type": "shingle",
    "min_shingle_size": "2",
    "max_shingle_size": "4",
    "filler_token": "",
    "output_unigrams": True,
}

# Index-time analyzer: standard tokenizer followed by the three filters below.
_evolution_analyzer = {
    "tokenizer": "standard",
    "filter": ["standard", "lowercase", "custom_shingle"],
}

# Field definitions for the "doc" mapping type.
_doc_properties = {
    "text": {
        "type": "text",
        "analyzer": "evolutionAnalyzer",
        "search_analyzer": "standard",
        "term_vector": "yes",
    },
    "name": {"type": "keyword"},
    "train": {"type": "text"},
    "spam": {"type": "text"},
}

# Settings and mappings passed as the body when creating the index.
request_body = {
    "settings": {
        "analysis": {
            "analyzer": {"evolutionAnalyzer": _evolution_analyzer},
            "filter": {"custom_shingle": _custom_shingle_filter},
        }
    },
    "mappings": {"doc": {"properties": _doc_properties}},
}

In [11]:
# Index creation is left disabled here.
# NOTE(review): this call names the index 'spam_ham_2', whereas the indexing
# loop in this notebook writes to 'spam_ham' — confirm which index is meant to
# receive request_body's settings and mappings.
# es.indices.create(index = 'spam_ham_2', body = request_body)

In [10]:
# Any line that contains one of these markers (compared case-insensitively,
# anywhere in the line) is dropped from the message text.
ignore_lines = ["Return-Path: ", "Date: ",  "From: ", "To: ", "Message-ID: ", "References: ",
                "Content-Disposition: ", "Mime-Version: ", "In-Reply-To: ", "X-Virus-Scanned: ",
                "X-Virus-Status: ", "User-Agent: ", "X-Chzlrs: ", "X-BeenThere: ", "X-Mailman-Version: ",
                "Precedence: ", "List-Id: ", "List-Unsubscribe: ", "List-Archive: ", "List-Post: ",
                "List-Help: ", "List-Subscribe: ", "Content-Type: ", "Content-Transfer-Encoding: ",
                "Sender: ", "Errors-To: ", "X-Spam-Checker-Version: ", "X-Spam-Level: ",
                "X-Spam-Status: ", "X-Mailer: ", "X-Priority: ", "X-Spam: ", "X-Miltered: ", "X-UUID: ",
                "Status: ", "X-Miltered: ", "Content-Length: ", "Lines: ", "X-VirtualServer: ",
                "X-VirtualServerGroup: ", "X-Destination-ID: ", "X-MailingID: ", "X-SMFBL: ",
                "X-SMHeaderMap: ", "Lines: ", "X-Original-To: ", "Delivered-To: ", "x-mimeole: ",
                "thread-index: ", "svn commit: ", "received: ", "received-spf: ", "x-spam-check-by: ",
                "x-posted-by: ", "broadcastjobid: ", "content-class: ", "importance: ",
                "priority: ", "x-originalarrivaltime: ", "x-keywords: "]

# Fragments removed from the lines that are kept.
# NOTE(review): "ref" is removed wherever it occurs, including inside longer
# words — confirm that this is intended.
replace_words = ["subject: ", "reply-to: ", "re: ", "thread-topic: ", "ref"]

# Lower-cased once up front instead of once per line of every message.
_ignore_markers = tuple(marker.lower() for marker in ignore_lines)
_replace_fragments = tuple(fragment.lower() for fragment in replace_words)


def _clean_message_lines(raw_text):
    """Return the lower-cased lines of one raw message that survive filtering.

    Steps, in order:
      * lower-case the whole message;
      * if it contains "content-type: text/html;", replace it with
        BeautifulSoup's extracted text and strip leftover "<" and "/>";
      * discard the first line;
      * on a line of at least 9 characters starting with "received:", jump
        two lines ahead; if the line under inspection then starts with a tab,
        skip it;
      * drop every line containing one of the ignore markers;
      * delete the replace fragments and spell out "$" and "%".

    Args:
        raw_text: full text of one message file.

    Returns:
        List of cleaned lines, in their original order.
    """
    body = raw_text.lower()
    if body.find("content-type: text/html;") != -1:
        soup = BeautifulSoup(body, 'html.parser')
        body = soup.get_text().replace("<", "").replace("/>", "")

    lines = body.split("\n")[1:]
    total = len(lines)
    kept = []
    pos = 0
    while pos < total:
        line = lines[pos]

        # NOTE(review): the tab check only runs when the line first examined
        # has at least 9 characters; kept exactly as the original logic.
        if len(line) >= 9:
            if line[0:9].lower() == "received:":
                pos += 2
                if pos >= total:
                    # A "Received:" header within the last two lines used to
                    # raise IndexError; there is nothing left to keep.
                    break
                line = lines[pos]

            if len(line) > 0 and line[0] == "\t":
                pos += 1
                continue

        lowered = line.lower()
        if any(marker in lowered for marker in _ignore_markers):
            pos += 1
            continue

        for fragment in _replace_fragments:
            lowered = lowered.replace(fragment, "")

        kept.append(lowered.replace("$", "dollar").replace("%", "percentage"))
        pos += 1
    return kept


for path in data_paths:
    with open(path, encoding="utf8", errors='ignore') as file:
        text = _clean_message_lines(file.read())

    fulltext = "\n".join(text)
    fulltext_token = tokenizer.tokenize(fulltext)

    # Keep only tokens found in the English vocabulary.
    final_text = [word for word in fulltext_token if word in english_words]

    file_name = path.split("/")[2]

    _body = {
        # Fix: index the cleaned message text. The original stored file_name
        # here and never used final_text.
        "text" : " ".join(final_text),
        "name" : path,
        "train" : str(data_partition[path]),
        "spam" : str(spam_ham[file_name])
    }

    # NOTE(review): the commented-out index creation in this notebook names
    # 'spam_ham_2', not 'spam_ham' — confirm the target index.
    es.index(index='spam_ham', doc_type='doc', id=file_name, body=_body)
