In [2]:
import pandas as pd

data = pd.read_csv("spamdata.csv",encoding = "latin-1")

In [3]:
data.head(20)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
import string
punctuations = string.punctuation

from nltk.corpus import stopwords

stopwords_list = stopwords.words("english")

from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()


def _clean(text):
    cleaned_text = text.lower()
    
    cleaned_text = "".join(c for c in text if c not in punctuations)
    
    words = cleaned_text.split()
    
    words = [w for w in words if w not in stopwords_list]
    
    words = [lem.lemmatize(word,"v") for word in words]
    words = [lem.lemmatize(word,"n") for word in words]
    
    cleaned_text =" ".join(words)
    
    return cleaned_text



In [5]:
data["cleaned"] = data["text"].apply(_clean)

In [6]:
#feature enginering
#meta features


data["word_count"] = data["text"].apply(lambda x : len(x.split()))
data["word_count_cleaned"] = data["cleaned"].apply(lambda x : len(x.split()))

data["char_count"] = data["text"].apply(lambda x : len(x))
data["char_count_without_spaces"] = data["text"].apply(lambda x : len(x.replace(" ","")))

data["num_digit"] = data["text"].apply(lambda x : sum([1 if w.isdigit() else 0 for w in x.split()]))


In [7]:
data.head()


Unnamed: 0,label,text,cleaned,word_count,word_count_cleaned,char_count,char_count_without_spaces,num_digit
0,ham,"Go until jurong point, crazy.. Available only ...",Go jurong point crazy Available bugis n great ...,20,16,111,92,0
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,6,6,29,24,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry 2 wkly comp win FA Cup final tkts 2...,28,23,155,128,2
3,ham,U dun say so early hor... U c already then say...,U dun say early hor U c already say,11,9,49,39,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think go usf live around though,13,9,61,49,0


In [8]:
pos_dic={"noun":["NNP","NN","NNS","NNPS"],"verb":["VBZ","VB","VBD","VBN","VBG"]}

import nltk

def pos_check(text,family):
    tags = nltk.pos_tag(nltk.word_tokenize(text))
    count=0
    
    for tag in tags:
        tag = tag[1]
        if tag in pos_dic[family]:
            count+=1
    return count
    
#pos_check("i am playing in the ground","noun") 

In [9]:
data["noun_count"] = data["text"].apply(lambda x : pos_check(x,"noun"))
data["verb_count"] = data["text"].apply(lambda x : pos_check(x,"verb"))

In [10]:
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleaned,char_count,char_count_without_spaces,num_digit,noun_count,verb_count
0,ham,"Go until jurong point, crazy.. Available only ...",Go jurong point crazy Available bugis n great ...,20,16,111,92,0,10,1
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,6,6,29,24,0,4,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry 2 wkly comp win FA Cup final tkts 2...,28,23,155,128,2,13,3
3,ham,U dun say so early hor... U c already then say...,U dun say early hor U c already say,11,9,49,39,0,3,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think go usf live around though,13,9,61,49,0,1,4


In [11]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer

cvz = CountVectorizer()
cvz.fit(data["cleaned"].values)

count_vectors = cvz.transform(data["cleaned"].values)

In [12]:
count_vectors

<5572x8590 sparse matrix of type '<class 'numpy.int64'>'
	with 51048 stored elements in Compressed Sparse Row format>

In [13]:
word_tfidf = TfidfVectorizer(max_features=500)
word_tfidf.fit(data["cleaned"].values)

word_vectors_tfidf = word_tfidf.transform(data["cleaned"].values)

In [14]:
ngram_tfidf = TfidfVectorizer(max_features=500,ngram_range=(1,2))
ngram_tfidf.fit(data["cleaned"].values)

ngram_tfidf = ngram_tfidf.transform(data["cleaned"].values)

In [15]:
char_tfidf = TfidfVectorizer(max_features=500,analyzer="char")
char_tfidf.fit(data["cleaned"].values)

char_tfidf = char_tfidf.transform(data["cleaned"].values)

In [16]:
tfidf = dict(zip(word_tfidf.get_feature_names(),word_tfidf.idf_))
tfidf_idf = pd.DataFrame(columns=["word_tfidf"]).from_dict(tfidf,orient="index")
tfidf_idf.columns = ["word_tfidf"]


In [17]:
from scipy.sparse import hstack, csr_matrix

meta_features = ['word_count', 'word_count_cleaned',
       'char_count', 'char_count_without_spaces', 'num_digit', 'noun_count',
       'verb_count']

feature_set1 = data[meta_features]

train = hstack([word_vectors_tfidf,csr_matrix(feature_set1)],"csr")
train

<5572x507 sparse matrix of type '<class 'numpy.float64'>'
	with 64155 stored elements in Compressed Sparse Row format>

In [18]:
from sklearn.preprocessing import LabelEncoder

target = data["label"].values
target = LabelEncoder().fit_transform(target)

In [19]:
target

array([0, 0, 1, ..., 0, 0, 0])

In [20]:
from sklearn.model_selection import train_test_split

train_x, val_x, train_y, val_y = train_test_split(train, target)

In [21]:
train_x.shape

(4179, 507)

In [22]:
val_x.shape

(1393, 507)

In [23]:
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import ensemble
from sklearn.metrics import accuracy_score

In [24]:
model = naive_bayes.MultinomialNB()
model.fit(train_x,train_y)
preds = model.predict(val_x)
accuracy_score(preds,val_y)

0.9770279971284996

In [25]:
model = LogisticRegression()
model.fit(train_x,train_y)
preds = model.predict(val_x)
accuracy_score(preds,val_y)



0.9770279971284996

In [26]:
model = svm.SVC()
model.fit(train_x,train_y)
preds = model.predict(val_x)
accuracy_score(preds,val_y)



0.9368269921033741

In [27]:
model = ensemble.ExtraTreesClassifier()
model.fit(train_x,train_y)
preds = model.predict(val_x)
accuracy_score(preds,val_y)



0.9748743718592965

In [30]:
import numpy as np

embeddings_index={}
for i, line in enumerate(open('pretrained.vec',encoding = "utf8")):
    if i ==0:
        continue
    value = line.split()
    embeddings_index[value[0]] = np.array(value[1:],dtype="float32")

In [31]:
from keras.preprocessing import text, sequence
token = text.Tokenizer()
token.fit_on_texts(data["text"])
word_index = token.word_index

Using TensorFlow backend.


In [33]:
word_index

{'i': 1,
 'to': 2,
 'you': 3,
 'a': 4,
 'the': 5,
 'u': 6,
 'and': 7,
 'in': 8,
 'is': 9,
 'me': 10,
 'my': 11,
 'for': 12,
 'your': 13,
 'it': 14,
 'of': 15,
 'call': 16,
 'have': 17,
 'on': 18,
 '2': 19,
 'that': 20,
 'now': 21,
 'are': 22,
 'so': 23,
 'but': 24,
 'not': 25,
 'or': 26,
 'do': 27,
 'can': 28,
 'at': 29,
 "i'm": 30,
 'get': 31,
 'be': 32,
 'will': 33,
 'if': 34,
 'ur': 35,
 'with': 36,
 'just': 37,
 'no': 38,
 'we': 39,
 'this': 40,
 'gt': 41,
 '4': 42,
 'lt': 43,
 'up': 44,
 'when': 45,
 'ok': 46,
 'free': 47,
 'from': 48,
 'how': 49,
 'go': 50,
 'all': 51,
 'out': 52,
 'what': 53,
 'know': 54,
 'like': 55,
 'good': 56,
 'then': 57,
 'got': 58,
 'was': 59,
 'come': 60,
 'its': 61,
 'am': 62,
 'time': 63,
 'only': 64,
 'day': 65,
 'love': 66,
 'there': 67,
 'send': 68,
 'he': 69,
 'want': 70,
 'text': 71,
 'as': 72,
 'txt': 73,
 'one': 74,
 'going': 75,
 'by': 76,
 'home': 77,
 "i'll": 78,
 'need': 79,
 'about': 80,
 'r': 81,
 'lor': 82,
 'sorry': 83,
 'stop': 84,
 'st

In [35]:
trainx,valx,trainy,valy = train_test_split(data["text"],target)

trainx = sequence.pad_sequences(token.texts_to_sequences(trainx),maxlen=70)
valx = sequence.pad_sequences(token.texts_to_sequences(valx),maxlen=70)

embedding_matrix = np.zeros((len(word_index)+1,300))
for word,i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i]=embedding_vector
    

In [36]:
def train_model(classfier,feature_vector_train, label, feature_vector_val,valid_y):
    classifier.fit(feature_vector_train,label)
    predictions = classfier.predict(feature_vector_val)
    predictions = predictions.argmax(axis=-1)
    return accuracy_score(predictions,valid_y)

In [41]:
from keras import layers, models, optimizers

def create_cnn():
    input_layer = layers.Input((70,))
    embedding_layer = layers.Embedding(len(word_index)+1,300,weights = [embedding_matrix],trainable=False)(input_layer)
    conv_layer = layers.Convolution1D(100,3,activation='relu')(embedding_layer)
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)
    output_layer = layers.Dense(50,activation='relu')(pooling_layer)
    output_layer = layers.Dropout(.25)(output_layer)
    output_layer = layers.Dense(1, activation='sigmoid')(output_layer)
    
    model = models.Model(inputs = input_layer,outputs = output_layer)
    model.compile(optimizer=optimizers.Adam(),loss='binary_crossentropy')
    return model

    

In [42]:
classifier = create_cnn()

In [43]:
train_model(classifier,trainx,trainy,valx,valy)

Epoch 1/1


0.8765254845656856