In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier

In [4]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score

In [6]:
# Shared hyper-parameters for the seeded / ensemble estimators below.
RANDOM_STATE = 32
N_ESTIMATORS = 50

lr = LogisticRegression()
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
# A decision tree permutes the candidate features at every split, so without
# a fixed seed ties can be broken differently from run to run.
dtc = DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)
abc = AdaBoostClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
bc = BaggingClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
etc = ExtraTreesClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
gbdt = GradientBoostingClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
# xgb = XGBClassifier(n_estimators=50, random_state = 32)

In [7]:
# Registry of candidate classifiers keyed by a short name. Iteration follows
# the insertion order below.
mdls = dict(
    mnb=mnb,
    lr=lr,
    svc=svc,
    knc=knc,
    dtc=dtc,
    abc=abc,
    bc=bc,
    etc=etc,
    gbdt=gbdt,
    # xgb=xgb,
)

In [42]:
# def predict_mdl(mdl, X_train, y_train, X_test, y_test):
#     mdl.fit(X_train, y_train)
#     print("model trained")
#     y_preds = mdl.predict(X_test)
#     return accuracy_score(y_preds, y_test), precision_score(y_preds, y_test)

def predict_mdl(mdl, X_train, y_train, X_test, y_test):
    """Fit ``mdl`` on the training split and score it on the test split.

    Prints the accuracy, the confusion matrix and the precision (in that
    order) and returns the two scalar scores.

    Args:
        mdl: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out (true) labels.

    Returns:
        tuple: ``(accuracy, precision)`` measured on the test split.
    """
    mdl.fit(X_train, y_train)
    y_preds = mdl.predict(X_test)

    # Compute each metric exactly once and always as metric(y_true, y_pred):
    # precision is not symmetric, so swapping the arguments silently reports
    # a different quantity.
    accuracy = accuracy_score(y_test, y_preds)
    precision = precision_score(y_test, y_preds)

    print(accuracy)
    print(confusion_matrix(y_test, y_preds))
    print(precision)

    return accuracy, precision

In [9]:
# predict_mdl(lr,X_train, y_train, X_test, y_test)

In [10]:
import pandas as pd

In [11]:
df = pd.read_csv('../data/spam_new.csv',encoding='latin-1')

In [12]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [13]:
# Give the raw label / message columns descriptive names.
df = df.rename(columns={'v1': 'status', 'v2': 'sms'})

In [14]:
# Drop the unused 'Unnamed' columns. errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError once the
# columns are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [15]:
# Encode labels as integers (ham -> 0, spam -> 1). The identity entries make
# the cell idempotent: re-running it on already-encoded labels no longer
# turns the whole column into NaN.
df['status'] = df['status'].map({"ham": 0, "spam": 1, 0: 0, 1: 1})

In [16]:
# from word_process import WordProcess

In [17]:
# wp = WordProcess()

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [20]:
# cv = CountVectorizer()
# cv.fit(X_train)

In [28]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords, wordnet
# from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import wordpunct_tokenize

# Fetch the NLTK resources used in this notebook. As the captured output
# shows, a package that is already present is reported as up-to-date.
# Tokenizer data: nltk.word_tokenize raises LookupError without it.
nltk.download('punkt')
nltk.download('punkt_tab')
# English stop-word list read via stopwords.words('english').
nltk.download("stopwords")
# NOTE(review): presumably only needed for the WordNet-lemmatizer / POS-tag
# variant of the preprocessing - confirm before removing.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /Users/suraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/suraj/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /Users/suraj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/suraj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/suraj/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [26]:
ps = PorterStemmer()
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def transform_text(text):
    """Normalise a raw message into a space-joined string of stemmed tokens.

    Steps: lowercase, tokenize with ``nltk.word_tokenize``, keep only
    alphanumeric tokens, drop English stop words, then stem every remaining
    token with the module-level ``ps`` stemmer.

    Args:
        text (str): Raw message text.

    Returns:
        str: The surviving stemmed tokens joined by single spaces (empty
        string when nothing survives).
    """
    # Loaded once and kept as a set: the stop-word list used to be re-read
    # and scanned linearly for every single token.
    stop_words = _english_stopwords()
    tokens = nltk.word_tokenize(text.lower())

    # isalnum() already rejects every punctuation-only token, so no separate
    # string.punctuation test is needed.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return " ".join(ps.stem(tok) for tok in kept)

In [29]:
# df['processed'] = df['sms'].apply(wp.process_sent2sent)
df['processed'] = df['sms'].apply(transform_text)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(df['processed'],
                                                    df['status'],test_size=0.2, random_state = 32)

In [31]:
cv = TfidfVectorizer(max_features=3000)
cv.fit(X_train)

In [32]:
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)

In [33]:
import numpy as np

In [34]:
y_train_cv = np.expand_dims(np.array(y_train),axis=-1)
y_test_cv = np.expand_dims(np.array(y_test),axis=-1)

In [35]:
X_train_cv.shape

(4457, 3000)

In [36]:
# Baseline: fit MultinomialNB on the TF-IDF features and report accuracy,
# confusion matrix and precision on the held-out split.
mnb.fit(X_train_cv, y_train)
y_pred2 = mnb.predict(X_test_cv)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred2))

0.9704035874439462
[[969   0]
 [ 33 113]]
1.0


In [43]:
# lr.fit(X_train_cv,y_train)
predict_mdl(mnb,X_train_cv, y_train, X_test_cv, y_test)

0.9704035874439462
[[969   0]
 [ 33 113]]
1.0


(0.9704035874439462, np.float64(1.0))

In [88]:
# predict_mdl(lr,X_train_cv, y_train, X_test_cv, y_test)

In [44]:
# Evaluate every registered model on the same split and collect the scores
# in three parallel lists (same order as `mdls`).
names, accuracies, precisions = [], [], []
banner = "---===---" * 5
for name, mdl in mdls.items():
    print()
    print(banner)
    print(name)
    acc, pre = predict_mdl(mdl, X_train_cv, y_train, X_test_cv, y_test)
    print(acc, pre)
    names.append(name)
    accuracies.append(acc)
    precisions.append(pre)


---===------===------===------===------===---
mnb
0.9704035874439462
[[969   0]
 [ 33 113]]
1.0
0.9704035874439462 1.0

---===------===------===------===------===---
lr
0.9659192825112107
[[968   1]
 [ 37 109]]
0.990909090909091
0.9659192825112107 0.990909090909091

---===------===------===------===------===---
svc
0.9802690582959641
[[967   2]
 [ 20 126]]
0.984375
0.9802690582959641 0.984375

---===------===------===------===------===---
knc
0.9192825112107623
[[969   0]
 [ 90  56]]
1.0
0.9192825112107623 1.0

---===------===------===------===------===---
dtc
0.9354260089686098
[[952  17]
 [ 55  91]]
0.8425925925925926
0.9354260089686098 0.8425925925925926

---===------===------===------===------===---
abc




0.9730941704035875
[[967   2]
 [ 28 118]]
0.9833333333333333
0.9730941704035875 0.9833333333333333

---===------===------===------===------===---
bc
0.9659192825112107
[[956  13]
 [ 25 121]]
0.9029850746268657
0.9659192825112107 0.9029850746268657

---===------===------===------===------===---
etc
0.9820627802690582
[[967   2]
 [ 18 128]]
0.9846153846153847
0.9820627802690582 0.9846153846153847

---===------===------===------===------===---
gbdt
0.9497757847533632
[[965   4]
 [ 52  94]]
0.9591836734693877
0.9497757847533632 0.9591836734693877


In [45]:
names,accuracies, precisions

(['mnb', 'lr', 'svc', 'knc', 'dtc', 'abc', 'bc', 'etc', 'gbdt'],
 [0.9704035874439462,
  0.9659192825112107,
  0.9802690582959641,
  0.9192825112107623,
  0.9354260089686098,
  0.9730941704035875,
  0.9659192825112107,
  0.9820627802690582,
  0.9497757847533632],
 [np.float64(1.0),
  np.float64(0.990909090909091),
  np.float64(0.984375),
  np.float64(1.0),
  np.float64(0.8425925925925926),
  np.float64(0.9833333333333333),
  np.float64(0.9029850746268657),
  np.float64(0.9846153846153847),
  np.float64(0.9591836734693877)])

In [46]:
rec = pd.DataFrame({"models":names,"accuracy":accuracies,"precision":precisions})

In [222]:
# rec['yt_tfidf_m3k_acc'] = accuracies
# rec['yt_tfidf_m3k_prec'] = precisions

In [67]:
# rec.sort_values(by=['yt_tfidf_m3k_prec','yt_tfidf_m3k_acc'],ascending=[False,False])
rec

Unnamed: 0,models,accuracy,precision
0,lr,0.965919,0.746575
1,svc,0.980269,0.863014
2,knc,0.913901,0.342466
3,mnb,0.970404,0.773973
4,dtc,0.935426,0.623288
5,abc,0.973094,0.808219
6,bc,0.965919,0.828767
7,etc,0.982063,0.876712
8,gbdt,0.949776,0.643836
9,xgb,0.967713,0.780822


In [113]:
# rec.to_csv("spam_lemm.csv",index=False)

In [84]:
text = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [101]:
# Preprocess with transform_text, the same function that produced the
# 'processed' column the vectorizer and model were fitted on. Using a
# different preprocessor at inference time feeds the model tokens it was
# never trained on.
mnb.predict(cv.transform([transform_text(text)]))

array([0])

In [88]:

text = df[df['status']==1].iloc[98]['sms']

In [100]:
ind = 70
text = X_test.iloc[ind]
y_test.iloc[ind]

0

In [37]:
transform_text(df['sms'].iloc[0])

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/suraj/nltk_data'
    - '/Applications/anaconda3/envs/aimlEnv/nltk_data'
    - '/Applications/anaconda3/envs/aimlEnv/share/nltk_data'
    - '/Applications/anaconda3/envs/aimlEnv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [169]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.porter import PorterStemmer


class WordProcess1:
    """Sentence preprocessor: lowercase, tokenize, filter, then stem.

    Despite the ``lemm`` attribute name, a ``PorterStemmer`` is used; the
    WordNet lemmatizer variant is kept only as a commented-out alternative.
    """

    # Individual punctuation characters; a token made up solely of these is
    # treated as punctuation.
    _PUNCT_CHARS = frozenset(string.punctuation)

    def __init__(self):
        """Create the stemmer and fetch the NLTK resources."""
        # self.lemm = WordNetLemmatizer()
        self.lemm = PorterStemmer()
        nltk.download("stopwords")
        nltk.download('wordnet')
        nltk.download('averaged_perceptron_tagger')

    def get_wordnet_pos(self, pos):
        """Map a POS tag to the matching ``wordnet`` constant.

        Tags starting with J / V / N / R map to ADJ / VERB / NOUN / ADV;
        anything else falls back to NOUN. Not used by ``process_sentence``
        while stemming is active.
        """
        by_prefix = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        return by_prefix.get(pos[:1], wordnet.NOUN)

    def _stop_words(self):
        """Return the English stop-word set, building it on first use."""
        cached = getattr(self, "_stops", None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            self._stops = cached
        return cached

    def process_sentence(self, sent):
        """Turn a sentence into a list of stemmed tokens.

        Args:
            sent (str): Raw sentence.

        Returns:
            list[str]: Stems of the tokens that are neither English stop
            words nor made up entirely of punctuation characters.
        """
        tokens = wordpunct_tokenize(sent.lower())
        stops = self._stop_words()
        punct = self._PUNCT_CHARS

        # A token is punctuation when *every* character is punctuation. The
        # previous substring test against string.punctuation let runs such
        # as "!!!" through and needed ".." / "..." patched in by hand.
        clean = [
            tok for tok in tokens
            if tok not in stops and not all(ch in punct for ch in tok)
        ]

        # POS tagging is skipped: the stemmer ignores tags, so tagging only
        # cost time and required an extra NLTK resource.
        # To lemmatize instead: self.lemm.lemmatize(word, self.get_wordnet_pos(tag))
        return [self.lemm.stem(tok) for tok in clean]

    def process_sent2sent(self, sent):
        """Return ``process_sentence(sent)`` joined into one space-separated string."""
        return " ".join(self.process_sentence(sent))

In [170]:
wp = WordProcess1()

[nltk_data] Downloading package stopwords to /Users/suraj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/suraj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/suraj/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [231]:
rec.to_csv("spam_models_v2-1Feb-0128.csv")

In [90]:
import pickle

In [94]:
# Persist the fitted classifier. The context manager closes the file handle
# even if pickling fails, so the file is flushed and never left open.
with open("spam_mnb_model_1Feb_0202.pkl", 'wb') as model_file:
    pickle.dump(mnb, model_file)

In [95]:
# Persist the fitted TF-IDF vectorizer. The context manager closes the file
# handle even if pickling fails, so the file is flushed and never left open.
with open("spam_tfidf_vec_1Feb_0202.pkl", 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)