In [38]:
import elasticsearch
from elasticsearch_dsl import Search
import scipy.sparse as sparse
from sklearn import tree
import numpy as np

In [2]:
from sklearn import linear_model
from sklearn.metrics import accuracy_score


In [3]:
es = elasticsearch.Elasticsearch()

In [4]:
results = elasticsearch.helpers.scan(es,
    index="spam_ham",
    doc_type="doc",
    preserve_order=True,
    query={"query": {"match_all": {}}},
)

index = 0
test_index = 0

train_doc_id_mapping = {}
train_id_doc_mapping = {}
train_is_spam = []


test_doc_id_mapping = {}
test_id_doc_mapping = {}
test_is_spam = []


for item in results:
    
    if item['_source']['train'] == "True":    
        train_doc_id_mapping[item['_id']] = index
        train_id_doc_mapping[index] = item['_id']
        is_spam = 0

        if item['_source']['spam'] == "True":
            is_spam = 1
            
        train_is_spam.append(is_spam)
        index = index + 1
        
    else:
        
           
        test_doc_id_mapping[item['_id']] = test_index
        test_id_doc_mapping[test_index] = item['_id']
        is_spam = 0

        if item['_source']['spam'] == "True":
            is_spam = 1
            
        test_is_spam.append(is_spam)
        test_index = test_index + 1
        

In [5]:
len(train_doc_id_mapping),len(test_doc_id_mapping), len(train_is_spam), len(test_is_spam)

(60336, 15083, 60336, 15083)

In [6]:
with open("feature_words") as file:
    features = file.read().split("\n")[:-1]

In [7]:
feature_id_mapping = {}
id_feature_mapping = {}
index = 0

for word in features:
    feature_id_mapping[word.lower()] = index
    id_feature_mapping[index] = word.lower()
    index = index + 1
    

In [8]:
docs_analysed = {}
body_doc = {
    "query": {    
        "match_phrase" : {
            "text" : ""
            }
        },
    "from" : 0,
    "size" : 1000
}

for i in feature_id_mapping:
    print(i)
    body_doc["query"]["match_phrase"]["text"] = i
    total_res = []
    hits = es.search(index="spam_ham", body=body_doc)
    total_res = total_res + hits['hits']['hits']
    
    while len(total_res) < hits['hits']['total']:
        body_doc["from"] = len(total_res)
        hits = es.search(index="spam_ham", body=body_doc, size=1000)
        total_res = total_res + hits['hits']['hits']
    
    if len(total_res) == 0:
        continue
    else:
        for doc in total_res:
            if doc["_id"] in docs_analysed:
                continue
            docs_analysed[doc["_id"]] = {}
            term_vects = es.termvectors(index = 'spam_ham', 
               doc_type = 'doc', 
               id = doc["_id"], 
               fields = ["text"], 
               payloads=False, 
               positions=False,
               offsets=False,
               term_statistics=True,
               field_statistics=True)
            
            for word in features:
                if(word.lower() in term_vects['term_vectors']['text']['terms']):
                    tf = term_vects['term_vectors']['text']['terms'][word]["term_freq"]
                    docs_analysed[doc["_id"]][feature_id_mapping[word.lower()]] = tf

authentic dollar discount
lower interest
fast delivery
porn
fast delivery guaranty
use a credit card
click to play
visit our website
click me
click here
last 10 minute
affordable fast price
mass gain
tinder
will make you
fast cash
remove ads
shocked to see
twitter reacts to
could possibly even happen
last 10 minutes
dollar discount
get for
shock to see
lose weight
weight loss
clearance sale
do not miss it
discount
blew my mind
free
dont miss it
free fast cash
ads
twitter react to
win extra cash
blew your mind
free bottle
this is why
see more


In [9]:
row = []
col = []
data = []
# label_train = []


row_test = []
col_test = []
data_test = []
# label_test = []

for doc in docs_analysed:
    for term in docs_analysed[doc]:
        if doc in train_doc_id_mapping:
            
            row.append(train_doc_id_mapping[doc])
            col.append(term)
            data.append(docs_analysed[doc][term])

        else:
            row_test.append(test_doc_id_mapping[doc])
            col_test.append(term)
            data_test.append(docs_analysed[doc][term])
            

In [10]:
sparse_mat_train = sparse.csr_matrix((data, (row, col)), shape=(len(train_doc_id_mapping), len(features)))

In [11]:
sparse_mat_test = sparse.csr_matrix((data_test, (row_test, col_test)), shape=(len(test_doc_id_mapping), len(features)))

In [12]:
sparse_mat_train

<60336x40 sparse matrix of type '<class 'numpy.int64'>'
	with 15388 stored elements in Compressed Sparse Row format>

In [13]:
sparse_mat_test

<15083x40 sparse matrix of type '<class 'numpy.int64'>'
	with 3846 stored elements in Compressed Sparse Row format>

In [31]:
clf = tree.DecisionTreeClassifier()
clf.fit(sparse_mat_train, train_is_spam)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [32]:
pred = clf.predict(sparse_mat_train)

In [33]:
print(accuracy_score(pred, train_is_spam))

0.6880635110050385


In [35]:
test_pred = clf.predict(sparse_mat_test)

In [36]:
print(accuracy_score(test_pred, test_is_spam))

0.6913743950142545


In [39]:
body_doc = {
  "query" :{
    "match": {
      "name": ""
    }
  } 
}

c = 1
for i in np.argsort(test_pred)[::-1][0:50]:
    
    body_doc["query"]["match"]["name"] = "trec07p/data/" + test_id_doc_mapping[i]
    hits = es.search(index="spam_ham", body=body_doc)
    print(c,"\t", test_id_doc_mapping[i], "\t","spam:",hits["hits"]["hits"][0]["_source"]["spam"])
    c = c + 1

1 	 inmail.24099 	 spam: True
2 	 inmail.61605 	 spam: False
3 	 inmail.59977 	 spam: True
4 	 inmail.15954 	 spam: True
5 	 inmail.70841 	 spam: True
6 	 inmail.16037 	 spam: True
7 	 inmail.10816 	 spam: True
8 	 inmail.32611 	 spam: True
9 	 inmail.31870 	 spam: True
10 	 inmail.74372 	 spam: True
11 	 inmail.39003 	 spam: True
12 	 inmail.13073 	 spam: False
13 	 inmail.36632 	 spam: True
14 	 inmail.6901 	 spam: True
15 	 inmail.41744 	 spam: False
16 	 inmail.36447 	 spam: False
17 	 inmail.62037 	 spam: False
18 	 inmail.21792 	 spam: True
19 	 inmail.2912 	 spam: True
20 	 inmail.33655 	 spam: False
21 	 inmail.30505 	 spam: True
22 	 inmail.40775 	 spam: False
23 	 inmail.11921 	 spam: True
24 	 inmail.34322 	 spam: False
25 	 inmail.42998 	 spam: False
26 	 inmail.71544 	 spam: True
27 	 inmail.67739 	 spam: False
28 	 inmail.60546 	 spam: False
29 	 inmail.73291 	 spam: True
30 	 inmail.23132 	 spam: True
31 	 inmail.42418 	 spam: False
32 	 inmail.6081 	 spam: True
33 	 inm