In [1]:
import elasticsearch
from elasticsearch_dsl import Search
import scipy.sparse as sparse
from sklearn.naive_bayes import GaussianNB

In [2]:
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn import tree
import numpy as np
from collections import Counter

In [3]:
es = elasticsearch.Elasticsearch()

In [4]:
results = elasticsearch.helpers.scan(es,
    index="spam_ham_2",
    doc_type="doc",
    preserve_order=True,
    query={"query": {"match_all": {}}},
)

index = 0
test_index = 0

train_doc_id_mapping = {}
train_id_doc_mapping = {}
train_is_spam = []


test_doc_id_mapping = {}
test_id_doc_mapping = {}
test_is_spam = []


for item in results:
    
    if item['_source']['train'] == "True":    
        train_doc_id_mapping[item['_id']] = index
        train_id_doc_mapping[index] = item['_id']
        is_spam = 0

        if item['_source']['spam'] == "True":
            is_spam = 1
            
        train_is_spam.append(is_spam)
        index = index + 1
        
    else:
        
           
        test_doc_id_mapping[item['_id']] = test_index
        test_id_doc_mapping[test_index] = item['_id']
        is_spam = 0

        if item['_source']['spam'] == "True":
            is_spam = 1
            
        test_is_spam.append(is_spam)
        test_index = test_index + 1
        

In [5]:
train_doc_id_mapping

{'inmail.10293': 44250,
 'inmail.18704': 39077,
 'inmail.72156': 21592,
 'inmail.26406': 12945,
 'inmail.44574': 48356,
 'inmail.59029': 30562,
 'inmail.50783': 25628,
 'inmail.7279': 9643,
 'inmail.62859': 31159,
 'inmail.73038': 58403,
 'inmail.68224': 45690,
 'inmail.47175': 41152,
 'inmail.14174': 34891,
 'inmail.73768': 17035,
 'inmail.73806': 53617,
 'inmail.47384': 20950,
 'inmail.42928': 22760,
 'inmail.42077': 4121,
 'inmail.11572': 54730,
 'inmail.54417': 27391,
 'inmail.51720': 18605,
 'inmail.25752': 6893,
 'inmail.18199': 16364,
 'inmail.72714': 21514,
 'inmail.55933': 26058,
 'inmail.5311': 41464,
 'inmail.30971': 44540,
 'inmail.58981': 20468,
 'inmail.21650': 12240,
 'inmail.28151': 7205,
 'inmail.43143': 23895,
 'inmail.37606': 4421,
 'inmail.11113': 704,
 'inmail.25795': 10409,
 'inmail.4092': 5920,
 'inmail.29174': 19113,
 'inmail.72308': 12521,
 'inmail.40953': 11318,
 'inmail.51666': 18894,
 'inmail.2882': 5762,
 'inmail.70998': 38391,
 'inmail.27686': 1222,
 'inma

In [6]:
len(train_doc_id_mapping),len(test_doc_id_mapping), len(train_is_spam), len(test_is_spam)

(60336, 15083, 60336, 15083)

In [7]:
with open("features_provided") as file:
    features = file.read().split(" ")[:-1]

In [8]:
feature_id_mapping = {}
id_feature_mapping = {}
index = 0

for word in features:
    if word.lower() not in feature_id_mapping:
        feature_id_mapping[word.lower()] = index
        id_feature_mapping[index] = word.lower()
        index = index + 1
    

In [9]:
index

72

In [10]:
row = []
col = []
data = []


row_test = []
col_test = []
data_test = []

docs_analysed = {}
body_doc = {
    "query": {    
        "match" : {
            "text" : ""
            }
        },
    "from" : 0,
    "size" : 1000
}

for i in feature_id_mapping:
    print(i , end=", ")
    body_doc["query"]["match"]["text"] = i
    total_res = []
    hits = es.search(index="spam_ham_2", body=body_doc)
    total_res = total_res + hits['hits']['hits']
    
    while len(total_res) < hits['hits']['total']:
        body_doc["from"] = len(total_res)
        hits = es.search(index="spam_ham_2", body=body_doc, size=1000)
        total_res = total_res + hits['hits']['hits']
        
        for doc in total_res:
            if doc['_id'] not in train_doc_id_mapping:
                
                data_test.append(doc['_score'])
                row_test.append(test_doc_id_mapping[doc['_id']])
                col_test.append(feature_id_mapping[i])

            else:
                data.append(doc['_score'])
                row.append(train_doc_id_mapping[doc['_id']])
                col.append(feature_id_mapping[i])

earn, check, profit, instant, win, our, deal, now, incredible, refinance, shopper, unsolicited, extra, lose, malware, valium, spam, trial, singles, chance, price, act, meet, freedom, clearance, credit, bonus, virus, viagra, double, home, $$$, performance, subscribe, compare, hidden, join, legal, miracle, weight, interest, celebrity, money, buy, website, medicine, lower, affordable, 100%, fast, loans, ad, dollars, rate, click, visit, here, success, limited, luxury, order, cash, collect, prize, $discount, free, remove, sales, million, diagnostics, no, only, 

In [11]:
len(data),len(row),len(col),len(feature_id_mapping)

(1931147, 1931147, 1931147, 72)

In [12]:
sparse_mat_train = sparse.csr_matrix((data, (row, col)), shape=(len(train_doc_id_mapping), len(feature_id_mapping)))

In [13]:
len(data_test),len(row_test),len(col_test),len(feature_id_mapping)

(484191, 484191, 484191, 72)

In [14]:
sparse_mat_test = sparse.csr_matrix((data_test, (row_test, col_test)), shape=(len(test_doc_id_mapping), len(feature_id_mapping)))

In [15]:
Counter(train_is_spam), Counter(test_is_spam)

(Counter({0: 20204, 1: 40132}), Counter({0: 5016, 1: 10067}))

In [16]:
clf = tree.DecisionTreeClassifier()
clf.fit(sparse_mat_train, train_is_spam)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [17]:
print(accuracy_score(clf.predict(sparse_mat_train), train_is_spam))

0.9906026252983293


In [18]:
test_pred = clf.predict(sparse_mat_test)

In [19]:
print(accuracy_score(test_pred, test_is_spam))

0.9620102101703905


In [20]:
clf_pf = GaussianNB()

In [21]:
clf_pf.fit(sparse_mat_train.toarray(), train_is_spam)

GaussianNB(priors=None)

In [22]:
test_pred = clf_pf.predict(sparse_mat_test.toarray())

In [25]:
body_doc = {
  "query" :{
    "match": {
      "name": ""
    }
  } 
}

c = 1
for i in np.argsort(test_pred)[::-1][0:50]:
    
    body_doc["query"]["match"]["name"] = test_id_doc_mapping[i]
    hits = es.search(index="spam_ham_2", body=body_doc)
    print(c,"\t", test_id_doc_mapping[i], "\t","spam:",hits["hits"]["hits"][0]["_source"]["spam"])
    c = c + 1

1 	 inmail.42896 	 spam: True
2 	 inmail.39649 	 spam: True
3 	 inmail.5688 	 spam: True
4 	 inmail.15064 	 spam: True
5 	 inmail.1306 	 spam: True
6 	 inmail.74070 	 spam: True
7 	 inmail.36791 	 spam: True
8 	 inmail.38910 	 spam: True
9 	 inmail.57998 	 spam: True
10 	 inmail.59744 	 spam: True
11 	 inmail.57565 	 spam: True
12 	 inmail.73441 	 spam: True
13 	 inmail.6926 	 spam: True
14 	 inmail.14116 	 spam: True
15 	 inmail.73986 	 spam: True
16 	 inmail.34758 	 spam: True
17 	 inmail.48208 	 spam: True
18 	 inmail.7715 	 spam: True
19 	 inmail.45086 	 spam: True
20 	 inmail.70786 	 spam: True
21 	 inmail.33269 	 spam: True
22 	 inmail.38008 	 spam: False
23 	 inmail.21201 	 spam: True
24 	 inmail.4340 	 spam: False
25 	 inmail.46713 	 spam: True
26 	 inmail.36741 	 spam: True
27 	 inmail.46655 	 spam: True
28 	 inmail.64290 	 spam: True
29 	 inmail.69940 	 spam: True
30 	 inmail.65610 	 spam: True
31 	 inmail.53289 	 spam: True
32 	 inmail.54972 	 spam: True
33 	 inmail.34802 	 