In [None]:
# --- Imports and one-time setup for the notebook ---
import pandas as pd
import numpy as np
import re 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy
import gensim
# Fetch the NLTK 'punkt' and 'stopwords' packages (a no-op when they are already up to date).
nltk.download('punkt')
nltk.download('stopwords')
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix,hstack
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
import pickle
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from the libraries imported above.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
class TextFeaturization():
  '''Cleaning and feature-extraction helpers for a single email document.

  The methods work on one document (a ``str``) at a time. ``clean_document``
  chains the cleaning steps; ``featurize`` turns a cleaned document into one
  feature row using vectorizers/models unpickled from the working directory
  (``tfidf_model.sav``, ``bow_model.sav``, ``lda_model.sav``).

  NOTE(review): the pickled artifacts were fitted on the output of this
  cleaning pipeline, so steps that shape the text are deliberately kept
  byte-for-byte compatible. Known quirks are flagged with NOTE(review)
  comments instead of being changed here; fixing them requires re-fitting
  the models.
  '''

  # Pattern used to locate email addresses.
  # NOTE(review): the class ``[A-Z|a-z]`` also accepts a literal ``|`` in the
  # top-level domain; it was most likely meant to be ``[A-Za-z]``.
  _EMAIL_PATTERN = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'

  # Contraction rewrites, applied in this exact order (the specific forms
  # must run before the generic "n't" / "'t" rules).
  _CONTRACTIONS = (
    (r"won't", "will not"),
    (r"can\'t", "can not"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
  )

  # spaCy pipeline, created on first use and shared by every instance so the
  # model is not reloaded from disk for each document.
  _nlp = None

  @staticmethod
  def _load_pickle(path):
    '''Unpickle the object stored at ``path`` and close the file afterwards.'''
    # SECURITY: pickle.load can execute arbitrary code; only load artifacts
    # that come from a trusted source.
    with open(path, 'rb') as handle:
      return pickle.load(handle)

  def extract_email_ids(self,doc):
    '''Return the domain words of every email address found in ``doc``.

    For each address the part after ``@`` is split on ``.``; pieces equal to
    ``"com"`` or at most two characters long are dropped, the rest are joined
    with spaces and lower-cased. The per-address strings are then joined with
    a single space. ``doc`` itself is not modified.
    '''
    domain_words = []
    for address in re.findall(self._EMAIL_PATTERN, doc):
      # The pattern admits exactly one "@", so index 1 is the domain part.
      parts = address.split("@")[1].split(".")
      kept = [part for part in parts if not (part == "com" or len(part) <= 2)]
      domain_words.append(' '.join(kept).lower())
    return " ".join(domain_words)

  def text_lowercase(self,doc):
    '''Return ``doc`` converted to lower case.'''
    return doc.lower()

  def remove_digits(self, doc):
    '''Return ``doc`` with every digit character removed.'''
    return re.sub(r'\d', '', doc)

  def remove_underscores(self, doc):
    '''Return ``doc`` with every underscore removed.'''
    return re.sub(r'_', '', doc)

  def remove_excess_whitespace(self, doc):
    '''Collapse each run of whitespace in ``doc`` into a single space.'''
    return re.sub(r'\s+', ' ', doc)

  def remove_special_characters(self, doc):
    '''Replace every non-word character in ``doc`` with a space.'''
    return re.sub(r'\W', ' ', doc)

  def remove_within_brackets(self, doc):
    '''Remove ``(...)`` groups and ``<...>`` spans from ``doc``.'''
    text = re.sub(r'\([^()]*\)', '', doc)
    # NOTE(review): ``[^()]*`` is greedy and does not exclude "<" or ">", so
    # one match can run from the first "<" to the last ">" when no parenthesis
    # lies between them, deleting the text in between. ``<[^<>]*>`` is likely
    # what was intended; left as-is to stay compatible with the fitted models.
    text = re.sub(r'<[^()]*>', '', text)
    return text

  def expand_words(self, phrase):
    '''Expand common English contractions in ``phrase`` (e.g. "can't" -> "can not").'''
    for pattern, replacement in self._CONTRACTIONS:
      phrase = re.sub(pattern, replacement, phrase)
    return phrase

  def remove_short_and_long_words(self, doc):
    '''Keep only the words of ``doc`` that are 3 to 14 characters long.

    Words are obtained by splitting on whitespace and re-joined with single
    spaces.
    '''
    return ' '.join(word for word in doc.split() if 2 < len(word) < 15)

  def text_lematizer(self,doc):
    '''Return ``doc`` with each token replaced by its spaCy lemma.

    The spaCy model (parser and NER disabled) is loaded once and cached on
    the class.
    '''
    if TextFeaturization._nlp is None:
      try:
        TextFeaturization._nlp = spacy.load('en', disable=['parser', 'ner'])
      except OSError:
        # spaCy v3 removed the 'en' shortcut link; fall back to the full
        # package name in environments where the shortcut cannot be resolved.
        TextFeaturization._nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    parsed = TextFeaturization._nlp(doc)
    return " ".join([token.lemma_ for token in parsed])

  def clean_document(self,doc):
    '''Run the full cleaning pipeline on one raw document and return the result.

    Steps, in order: lower-case, collect email domain words, strip bracketed
    content, expand contractions, drop underscores, replace special characters,
    drop digits, collapse whitespace, drop words outside 3-14 characters,
    lemmatize, then prepend the collected email domain words.
    '''
    doc = self.text_lowercase(doc)
    ids = self.extract_email_ids(doc)
    doc = self.remove_within_brackets(doc)
    doc = self.expand_words(doc)
    doc = self.remove_underscores(doc)
    doc = self.remove_special_characters(doc)
    doc = self.remove_digits(doc)
    doc = self.remove_excess_whitespace(doc)
    doc = self.remove_short_and_long_words(doc)
    doc = self.text_lematizer(doc)
    # NOTE(review): there is no separator between ``ids`` and ``doc``, so the
    # last domain word is fused with the first word of the document. Adding a
    # space changes the tokens seen by the fitted vectorizers, so it is left
    # unchanged here; fix together with a re-fit.
    doc = ids+doc
    return doc

  def remove_stopwords(self,doc,stopword_list):
    '''Return ``doc`` without the words contained in ``stopword_list``.

    ``doc`` is split on single spaces and the surviving words are re-joined
    with single spaces.
    '''
    # A set gives constant-time lookups; other container types are used as-is
    # so their own membership semantics are preserved.
    if isinstance(stopword_list, (list, tuple)):
      lookup = set(stopword_list)
    else:
      lookup = stopword_list
    return " ".join([word for word in doc.split(" ") if word not in lookup])

  def word_count(self,doc):#need to change to corpus
    '''Return the number of whitespace-separated words in ``doc``.'''
    return len(doc.split())

  def average_word_length(self,doc):
    '''Return the mean length of the whitespace-separated words in ``doc``.

    Returns ``0.0`` when ``doc`` contains no words, instead of dividing by
    zero.
    '''
    words = doc.split()
    if not words:
      return 0.0
    return sum(len(word) for word in words) / len(words)

  def tfidf_test(self,data):
    '''Transform the single document ``data`` with the vectorizer pickled in ``tfidf_model.sav``.'''
    tfidf_model = self._load_pickle('tfidf_model.sav')
    return tfidf_model.transform([data])

  def perform_lda(self,data):
    '''Transform the single document ``data`` with the models in ``bow_model.sav`` and ``lda_model.sav``.

    The document is first transformed by the bag-of-words vectorizer, and that
    result is passed to the LDA model's ``transform``.
    '''
    vectorizer_bow = self._load_pickle('bow_model.sav')
    lda_model = self._load_pickle('lda_model.sav')
    bow = vectorizer_bow.transform([data])
    return lda_model.transform(bow)

  def featurize(self,X_test,email_stopwords):
    '''Build the feature row for one cleaned document.

    Stop words in ``email_stopwords`` are removed from ``X_test`` first. The
    row is laid out as: word count, average word length, the first row of the
    LDA output, then the dense first row of the TF-IDF output.

    Returns a NumPy array of shape ``(1, n_features)``.
    '''
    test_doc = self.remove_stopwords(X_test,email_stopwords)

    word_count_test = self.word_count(test_doc)
    avg_word_len_test = self.average_word_length(test_doc)
    lda_test = self.perform_lda(test_doc)
    tfidf_test_vec = self.tfidf_test(test_doc)

    email_data = [word_count_test, avg_word_len_test]
    email_data.extend(lda_test[0])
    email_data.extend(tfidf_test_vec.toarray()[0])
    return np.array(email_data).reshape(1, -1)


In [None]:
class EmailClassification(TextFeaturization):
  '''Classify a raw email as "SPAM" or "HAM" using pickled artifacts.'''

  def classify(self,data):
    '''Clean, featurize and classify one raw email.

    Reads the stop-word collection from ``stopword.pkl`` and the classifier
    from ``rf_model.sav`` in the working directory.

    Returns ``"SPAM"`` when the classifier predicts ``1``, otherwise ``"HAM"``.
    '''
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    # The local is not called ``stopwords`` so it does not shadow the
    # ``nltk.corpus.stopwords`` import.
    with open('stopword.pkl', 'rb') as f:
      email_stopwords = pickle.load(f)
    cleaned = self.clean_document(data)
    features = self.featurize(cleaned,email_stopwords)
    # Use a context manager so the model file handle is closed after loading.
    with open('rf_model.sav', 'rb') as f:
      model = pickle.load(f)
    # ``features`` holds a single row, so take the single prediction explicitly
    # rather than relying on the truthiness of an array comparison.
    return ("SPAM" if model.predict(features)[0]==1 else "HAM")

In [None]:
# Sample raw email (mail headers followed by the message body) used as classifier input.
data = '''From rssfeeds@jmason.org Fri Oct 4 11:02:10 2002 Return-Path: <rssfeeds@spamassassin.taint.org> Delivered-To: yyyy@localhost.spamassassin.taint.org Received: from localhost (jalapeno [127.0.0.1]) by jmason.org (Postfix) with ESMTP id 3C0DA16F6F for <jm@localhost>; Fri, 4 Oct 2002 11:01:47 +0100 (IST) Received: from jalapeno [127.0.0.1] by localhost with IMAP (fetchmail-5.9.0) for jm@localhost (single-drop); Fri, 04 Oct 2002 11:01:47 +0100 (IST) Received: from dogma.slashnull.org (localhost [127.0.0.1]) by dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g9480QK08803 for <jm@jmason.org>; Fri, 4 Oct 2002 09:00:26 +0100 Message-Id: <200210040800.g9480QK08803@dogma.slashnull.org> To: yyyy@spamassassin.taint.org From: boingboing <rssfeeds@spamassassin.taint.org> Subject: Don't do the brown WiFi, the brown WiFi is BAD Date: Fri, 04 Oct 2002 08:00:25 -0000 Content-Type: text/plain; encoding=utf-8 URL: http://boingboing.net/#85515860 Date: Not supplied Rob "Pringles Can" Flickenger and others Cliff Skolnik at the O'Reilly OS X con has tracked down the cause of the annoying flakiness in the wireless network here -- every 20 or 30 seconds, you start getting "connection refused" messages from your browser and other net-utilities. Rob "Pringles Can" Flickenger wrote it up. It turns out that running the great network-spy app Etherpeg[1] (or other "promiscuous" network sniffers) and the built-in firewall in OS X at the same time causes your computer to begin intercepting every packet sent out on your segment of the wireless network and respond to it with a "rejected" message. So today, Rob (and everyone else who knows about this) is going to run around and tell people running Etherpeg to _turn off the firewall_ (and vice-versa). Ah, fickle networking, you are such a stern mistress! Link[2] Discuss[3] (_ Thanks, Rob!_) [1] http://www.oreillynet.com/pub/wlg/1414 [2] http://www.oreillynet.com/pub/wlg/2086 [3] http://www.quicktopic.com/boing/H/bfYib9hETQSA'''

In [None]:
# Build the classifier and label the sample email held in `data`.
# classify() reads its pickled artifacts from the current working directory.
clf = EmailClassification()
clf.classify(data)

'HAM'