# Import Essential Libraries

In [None]:
import numpy as np
import pandas as pd
from string import punctuation
import re
import nltk
from nltk import word_tokenize


# Load data into the environment

In [None]:
# Read the raw SMS spam CSV; the file is decoded as latin-1.
sms_data = pd.read_csv(
    "/content/spam.csv",
    encoding='latin-1',
)
# Preview the first rows of the raw frame.
sms_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Filter out the unnecessary columns

In [None]:
# Keep only the labels of the first two columns of the raw frame.
cols = sms_data.columns[0:2]

In [None]:
# Select every row, restricted to the columns chosen in `cols`.
data = sms_data.loc[:, cols]

In [None]:
# Show (rows, columns) of the filtered frame.
print(f"{data.shape}")

(5572, 2)


Let us rename the columns

In [None]:
# Map the original column labels to descriptive names.
column_names = {"v1": "Value", "v2": "Text"}
data = data.rename(columns=column_names)
print(data.head(5))

  Value                                               Text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [None]:
# Count the messages per label to see how imbalanced the classes are.
class_counts = data["Value"].value_counts()
print(class_counts)

ham     4825
spam     747
Name: Value, dtype: int64


The classes are imbalanced (4825 ham vs. 747 spam), so we will use stratified sampling when splitting the data for training.

In [None]:
# Create a new feature "Punctuations" holding the number of punctuation
# characters in each message.
# The earlier pattern r"^\w+&&^\s]" relied on Java-style character-class
# intersection, which Python's `re` does not support; it could never match,
# so the column was 0 for every row. Count membership in string.punctuation
# instead.
punctuation = list(punctuation)
punctuation_chars = frozenset(punctuation)
data["Punctuations"] = data["Text"].apply(lambda x: sum(ch in punctuation_chars for ch in x))

# Create a new feature "Phone": the number of non-overlapping runs of
# 10 consecutive digits in the message (a rough phone-number detector).
data["Phone"] = data["Text"].apply(lambda x: len(re.findall(r"[0-9]{10}", x)))

# Create a new feature "Links": 1 if the message contains an http(s) URL, else 0.
def is_link(x):
  """Return 1 if `x` contains an http:// or https:// URL, otherwise 0."""
  return 1 if re.search(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", x) is not None else 0

data['Links'] = data["Text"].apply(is_link)

# Create a new feature "Uppercase": how many whitespace-separated tokens of
# the message satisfy str.isupper().
def count_upper(x):
  """Return the number of whitespace-separated tokens in `x` for which str.isupper() is true."""
  return sum(token.isupper() for token in x.split())

data["Uppercase"] = data["Text"].apply(count_upper)
print(data.head())

  Value                                               Text  Punctuations  \
0   ham  Go until jurong point, crazy.. Available only ...             0   
1   ham                      Ok lar... Joking wif u oni...             0   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...             0   
3   ham  U dun say so early hor... U c already then say...             0   
4   ham  Nah I don't think he goes to usf, he lives aro...             0   

   Phone  Links  Uppercase  
0      0      0          0  
1      0      0          0  
2      1      0          2  
3      0      0          2  
4      0      0          1  


In [None]:
# Count the unusual (non-dictionary) words in each SMS.
# First fetch the NLTK resources this cell relies on: the 'punkt' tokenizer
# data and the 'words' corpus.
for resource in ('punkt', 'words'):
  nltk.download(resource)

# Lazily-built cache of the lower-cased NLTK word list. Building it is
# expensive, so it must happen once, not once per message.
_ENGLISH_VOCAB = None


def _get_english_vocab():
  """Return the lower-cased NLTK `words` corpus as a frozenset, building it on first use only."""
  global _ENGLISH_VOCAB
  if _ENGLISH_VOCAB is None:
    _ENGLISH_VOCAB = frozenset(w.lower() for w in nltk.corpus.words.words())
  return _ENGLISH_VOCAB


def find_unusual_words(text, english_vocab=None):
  """Count the distinct alphabetic tokens of `text` that are not in the English vocabulary.

  Parameters
  ----------
  text : iterable of str
      Tokens of one message (e.g. the output of `word_tokenize`).
      Tokens that are not purely alphabetic are ignored; the rest are
      lower-cased before the lookup.
  english_vocab : iterable of str, optional
      Lower-cased reference vocabulary. When omitted, the NLTK `words`
      corpus is used; it is loaded once and cached, because rebuilding it
      for every message makes the per-row `apply` prohibitively slow.

  Returns
  -------
  int
      Number of distinct lower-cased alphabetic tokens missing from the vocabulary.
  """
  if english_vocab is None:
    english_vocab = _get_english_vocab()
  text_vocab = {w.lower() for w in text if w.isalpha()}
  # .difference accepts any iterable, so callers may pass a list or a set.
  return len(text_vocab.difference(english_vocab))

def _count_unusual(message):
  """Tokenize one message and return its number of unusual words."""
  tokens = word_tokenize(message)
  return find_unusual_words(tokens)

data["Ususualwords"] = data["Text"].apply(_count_unusual)

# Show rows 17 through 28 (by position) with the newly created features.
print(data.iloc[17:29])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


KeyboardInterrupt: ignored

Create the TF-IDF features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the message text: English stop words removed, accents
# stripped to ASCII, vocabulary capped at 300 terms.
# NOTE(review): the vectorizer is fitted on every message before the
# train/test split further down, so its statistics also see the test rows.
tf_idf = TfidfVectorizer(
    max_features = 300,
    strip_accents = 'ascii',
    stop_words = "english",
)
tf_idf_matrix = tf_idf.fit_transform(data["Text"])

We keep at most the 300 most frequent terms, after removing punctuation, stop words, accents, etc.

In [None]:
# Turn the sparse TF-IDF matrix into a dense frame with one column per term,
# then place it side by side with the hand-crafted features.
tfidf_frame = pd.DataFrame(
    tf_idf_matrix.toarray(),
    columns = tf_idf.get_feature_names_out(),
)
data_extra_features = pd.concat([data, tfidf_frame], axis = "columns")

In [None]:
# Preview the first five rows of the combined feature frame.
data_extra_features.head(n=5)

Unnamed: 0,Value,Text,Punctuations,Phone,Links,Uppercase,000,10,150p,150ppm,...,world,www,xmas,xxx,ya,yeah,year,yes,yo,yup
0,ham,"Go until jurong point, crazy.. Available only ...",0,0,0,0,0.0,0.0,0.0,0.0,...,0.594379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ham,Ok lar... Joking wif u oni...,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0,1,0,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ham,U dun say so early hor... U c already then say...,0,0,0,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,0,0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.model_selection import train_test_split

X = data_extra_features
# Every column except the label and the raw text is a model feature.
features = X.columns.drop(["Value", "Text"])
target = ["Value"]

# Use the label as a 1-D Series: passing a one-column DataFrame makes the
# estimators emit `column_or_1d` conversion warnings when fitting.
y = X[target[0]]

# The classes are imbalanced, so stratify on the label to keep the ham/spam
# ratio identical in both splits; fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X[features],
    y,
    stratify=y,
    random_state=42,
)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Decision tree that only splits nodes holding at least 40 samples.
# random_state is fixed because the tree permutes features at each split,
# so without a seed the fitted tree and the scores below can vary per run.
dt = DecisionTreeClassifier(min_samples_split=40, random_state=42)
dt.fit(X_train, y_train)
pred = dt.predict(X_test)

# Training accuracy first, then held-out accuracy.
print(accuracy_score(y_train, dt.predict(X_train)))
print(accuracy_score(y_test, pred))


0.985163914812156
0.9691313711414213


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Multinomial Naive Bayes: fit on the training split, then report the
# accuracy of its predictions on the test split.
mnb = MultinomialNB().fit(X_train, y_train)
pred_mnb = mnb.predict(X_test)
mnb_accuracy = accuracy_score(y_test, pred_mnb)
print(mnb_accuracy)

# Logistic regression with default settings, evaluated the same way.
lr = LogisticRegression().fit(X_train, y_train)
pred_lr = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, pred_lr)
print(lr_accuracy)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.968413496051687
0.9813352476669059
