In [1]:
import elasticsearch
from elasticsearch_dsl import Search
import scipy.sparse as sparse
from sklearn import linear_model
from sklearn.metrics import accuracy_score
import scipy
import numpy as np

In [2]:
es = elasticsearch.Elasticsearch()

In [3]:
results = elasticsearch.helpers.scan(es,
    index="spam_ham_2",
    doc_type="doc",
    preserve_order=True,
    query={"query": {"match_all": {}}},
)

index = 0
test_index = 0

train_doc_id_mapping = {}
train_id_doc_mapping = {}
train_is_spam = []


test_doc_id_mapping = {}
test_id_doc_mapping = {}
test_is_spam = []


for item in results:
    
    if item['_source']['train'] == "True":    
        train_doc_id_mapping[item['_id']] = index
        train_id_doc_mapping[index] = item['_id']
        is_spam = 0

        if item['_source']['spam'] == "True":
            is_spam = 1
            
        train_is_spam.append(is_spam)
        index = index + 1
        
    else:
        
           
        test_doc_id_mapping[item['_id']] = test_index
        test_id_doc_mapping[test_index] = item['_id']
        is_spam = 0

        if item['_source']['spam'] == "True":
            is_spam = 1
            
        test_is_spam.append(is_spam)
        test_index = test_index + 1
        

In [4]:
vocab = {}
reverse_vocab = {}
vocab_index = 0

In [5]:
data = []
column = []
row = []

In [6]:
for doc in train_doc_id_mapping:
    
    term_vects = es.termvectors(index = 'spam_ham_2', 
       doc_type = 'doc', 
       id = doc, 
       fields = ["text"], 
       payloads=False, 
       positions=False,
       offsets=False,
       term_statistics=True,
       field_statistics=True)
    
    if 'text' in term_vects['term_vectors'].keys():
        for term in term_vects['term_vectors']['text']['terms']:

            if term.lower() not in vocab:
                vocab[term.lower()] = vocab_index
                vocab_index = vocab_index + 1


            tf = term_vects['term_vectors']['text']['terms'][term]["term_freq"]

            row.append(train_doc_id_mapping[doc])
            column.append(vocab[term.lower()])
            data.append(tf)

In [7]:
vocab_index

9958074

In [8]:
sparse_mat_train = sparse.csr_matrix((data, (row, column)), shape=(len(train_doc_id_mapping), vocab_index))

In [9]:
scipy.sparse.save_npz('sparse_mat_train_ft.npz', sparse_mat_train)

In [10]:
np.save('train_label_ft.npy', np.array(train_is_spam)) 

In [11]:
regr = linear_model.LogisticRegression()

In [12]:
regr.fit(sparse_mat_train, train_is_spam)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
pred = regr.predict(sparse_mat_train)

In [14]:
print(accuracy_score(pred, train_is_spam))

0.999817687616017


In [15]:
data = []
column = []
row = []

for doc in test_doc_id_mapping:
    
    term_vects = es.termvectors(index = 'spam_ham_2', 
       doc_type = 'doc', 
       id = doc, 
       fields = ["text"], 
       payloads=False, 
       positions=False,
       offsets=False,
       term_statistics=True,
       field_statistics=True)
    
    if 'text' in term_vects['term_vectors'].keys():
        for term in term_vects['term_vectors']['text']['terms']:

            if term.lower() in vocab:

                tf = term_vects['term_vectors']['text']['terms'][term]["term_freq"]

                row.append(test_doc_id_mapping[doc])
                column.append(vocab[term.lower()])
                data.append(tf)

In [16]:
len(data)

10869140

In [17]:
sparse_mat_test = sparse.csr_matrix((data, (row, column)), shape=(len(test_doc_id_mapping), vocab_index))

In [18]:
scipy.sparse.save_npz('sparse_mat_test_ft.npz', sparse_mat_test)

In [19]:
np.save('test_label_ft.npy', np.array(test_is_spam)) 

In [20]:
test_pred = regr.predict(sparse_mat_test)

In [21]:
print(accuracy_score(test_pred, test_is_spam))

0.9989392030763111


In [22]:
# sparse_matrixaklfnlwkfnl = scipy.sparse.load_npz('sparse_mat_test.npz')

In [24]:
body_doc = {
  "query" :{
    "match": {
      "name": ""
    }
  } 
}

c = 1
for i in np.argsort(test_pred)[::-1][0:50]:
    
    body_doc["query"]["match"]["name"] = test_id_doc_mapping[i]
    hits = es.search(index="spam_ham_2", body=body_doc)
    print(c,"\t", test_id_doc_mapping[i], "\t","spam:",hits["hits"]["hits"][0]["_source"]["spam"])
    c = c + 1

1 	 inmail.68729 	 spam: True
2 	 inmail.40528 	 spam: True
3 	 inmail.21810 	 spam: True
4 	 inmail.31109 	 spam: True
5 	 inmail.9142 	 spam: True
6 	 inmail.2939 	 spam: True
7 	 inmail.6366 	 spam: True
8 	 inmail.35246 	 spam: True
9 	 inmail.34605 	 spam: True
10 	 inmail.66122 	 spam: True
11 	 inmail.41450 	 spam: True
12 	 inmail.51 	 spam: True
13 	 inmail.66046 	 spam: True
14 	 inmail.9744 	 spam: True
15 	 inmail.72770 	 spam: True
16 	 inmail.63900 	 spam: True
17 	 inmail.41721 	 spam: True
18 	 inmail.6896 	 spam: True
19 	 inmail.39692 	 spam: True
20 	 inmail.62180 	 spam: True
21 	 inmail.48899 	 spam: True
22 	 inmail.29295 	 spam: True
23 	 inmail.49034 	 spam: True
24 	 inmail.43318 	 spam: True
25 	 inmail.68175 	 spam: True
26 	 inmail.6675 	 spam: True
27 	 inmail.7709 	 spam: True
28 	 inmail.62356 	 spam: True
29 	 inmail.50292 	 spam: True
30 	 inmail.38600 	 spam: True
31 	 inmail.67126 	 spam: True
32 	 inmail.646 	 spam: True
33 	 inmail.49138 	 spam: Tru