In [9]:
# Standard library
import re  # used by the feature-engineering cell; it was not explicitly imported before

# Third-party.
# The original order is kept on purpose: `from nltk import *` is a wildcard
# import, so every explicit import below it must stay below it to keep
# overriding any same-named object the wildcard brings in.
import nltk
from nltk import *
from nltk.corpus import brown
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
from scipy.spatial.distance import cosine, euclidean, jaccard
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score

Read CSV file

In [2]:
# Load the raw CSV and take a quick look at its first two rows.
data = pd.read_csv("data/spam.csv", encoding='latin-1')
print(data.head(2))

# Only the first two columns are kept; they are given descriptive names.
cols = data.columns[:2]
sms_data = data.loc[:, cols].rename(columns={'v1':'Value', 'v2':'Text'})
sms_data.head()


    v1                                                 v2 Unnamed: 2  \
0  ham  Go until jurong point, crazy.. Available only ...        NaN   
1  ham                      Ok lar... Joking wif u oni...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  


Unnamed: 0,Value,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Feature Engineering

In [3]:
# Phone-number feature: how many non-overlapping runs of ten consecutive
# digits the message contains.
sms_data['Phone Number'] = sms_data['Text'].apply(lambda x: len(re.findall(r"[0-9]{10}", x)))

# Link feature: 1 when the message contains an http:// or https:// URL, else 0.
def is_link(x):
    """Return 1 if the string ``x`` contains an http(s) URL, otherwise 0."""
    # `is not None` is the idiomatic identity test for the "no match" result.
    return 1 if re.search(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", x) is not None else 0

sms_data['Link'] = sms_data['Text'].apply(is_link)
def get_unusual_words(text, english_vocab=None):
    """Return the sorted, lower-cased words of ``text`` missing from an English vocabulary.

    Parameters
    ----------
    text : str or iterable of str
        Either a raw message or an already tokenized sequence of words.
        A raw string is split into maximal runs of alphabetic characters
        (so "don't" yields "don" and "t"); for an iterable of tokens, only
        the purely alphabetic tokens are kept.
    english_vocab : iterable of str, optional
        Reference vocabulary, expected to be lower-cased already. When
        omitted, the NLTK ``words`` corpus is lower-cased once and cached on
        this function, so repeated calls (e.g. via ``Series.apply``) do not
        rebuild it.

    Returns
    -------
    list of str
        Alphabetically sorted words found in ``text`` but not in the vocabulary.
    """
    if isinstance(text, str):
        # Iterating a str yields single characters, not words, so a raw
        # message has to be tokenized first.
        tokens = "".join(ch if ch.isalpha() else " " for ch in text).split()
    else:
        tokens = [tok for tok in text if tok.isalpha()]
    text_vocab_set = {tok.lower() for tok in tokens}

    if english_vocab is None:
        english_vocab = getattr(get_unusual_words, "_english_vocab", None)
        if english_vocab is None:
            # Built lazily on first use and reused on every later call.
            english_vocab = frozenset(w.lower() for w in nltk.corpus.words.words())
            get_unusual_words._english_vocab = english_vocab

    return sorted(text_vocab_set.difference(english_vocab))
# Disabled step: would run get_unusual_words on every message and store the
# result in an 'Unusual Words' column.
# NOTE(review): the reason it is disabled is not recorded here — confirm before re-enabling.
# sms_data['Unusual Words'] = sms_data['Text'].apply(get_unusual_words)
sms_data.head()

Unnamed: 0,Value,Text,Phone Number,Link
0,ham,"Go until jurong point, crazy.. Available only ...",0,0
1,ham,Ok lar... Joking wif u oni...,0,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,0
3,ham,U dun say so early hor... U c already then say...,0,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,0


In [5]:
# TF-IDF representation of the message text: English stop words removed,
# accents stripped with the 'ascii' method, at most 300 features kept.
tfidf = TfidfVectorizer(
    strip_accents='ascii',
    stop_words='english',
    max_features=300,
)
tfidf_matrix = tfidf.fit_transform(sms_data['Text'])

In [7]:
# Place the dense TF-IDF matrix (one column per vocabulary term) side by side
# with the SMS frame.
data_extra_features = pd.concat(
    [
        sms_data,
        pd.DataFrame(
            data=tfidf_matrix.toarray(),
            columns=tfidf.get_feature_names_out(),
        ),
    ],
    axis=1,
)
data_extra_features.head()

Unnamed: 0,Value,Text,Phone Number,Link,000,10,150p,150ppm,16,18,...,world,www,xmas,xxx,ya,yeah,year,yes,yo,yup
0,ham,"Go until jurong point, crazy.. Available only ...",0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.594379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ham,Ok lar... Joking wif u oni...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ham,U dun say so early hor... U c already then say...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Train a Decision Tree Classifier

In [14]:
# Train/test split.
# Every column of `sms_data` except the label ('Value') and the raw 'Text'
# is used as a feature.
# NOTE(review): the TF-IDF columns live in `data_extra_features`, which is not
# used here — confirm that training on `sms_data` alone is intended.
features = sms_data.columns.drop(['Value', 'Text'])
x_train, x_test, y_train, y_test = train_test_split(
    sms_data[features],
    sms_data['Value'],
    stratify=sms_data['Value'],  # same label proportions in both partitions
    random_state=42,             # fixed seed: the split is identical on every run
)

# Classifier training; seeded so a Restart & Run All reproduces the same tree.
dt = DecisionTreeClassifier(min_samples_split=40, random_state=42)
dt.fit(x_train, y_train)

# Accuracy on the held-out partition.
print(accuracy_score(y_test, dt.predict(x_test)))


0.9418521177315147
