In [5]:
import kaggle

In [3]:
# Authenticate the Kaggle API client before the dataset download in the next cell.
kaggle.api.authenticate()

In [4]:
# Download the SMS Spam Collection archive into ./data and unzip it there.
# The dataset is addressed as '<owner>/<dataset-slug>'. The stray trailing slash
# was removed; NOTE(review): some client versions may treat it as an empty extra
# path segment, so the canonical two-part form is safer.
kaggle.api.dataset_download_files(
    'uciml/sms-spam-collection-dataset',
    path='data',
    unzip=True,
)

In [17]:
import warnings

import pandas as pd

# Silence every warning category for the rest of the kernel session.
warnings.filterwarnings(action="ignore")

# Read the raw CSV, decoding it as ISO-8859-1, and report its dimensions.
df = pd.read_csv(filepath_or_buffer='data/spam.csv', encoding="ISO-8859-1")
print('The shape of the dataset is:', df.shape)

The shape of the dataset is: (5572, 5)


In [18]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


#### Exploratory Data Analysis

In [25]:
# Discard the three 'Unnamed' columns, leaving only v1 and v2.
updated_df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [26]:
# Preview the first five rows after the column drop.
updated_df.head(n=5)


Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [28]:
# Column names used throughout: v2 is the model input, v1 is the ham/spam target.
model_feature, model_target = 'v2', 'v1'

### Text processing

In [30]:
# Summarise column dtypes and non-null counts of the trimmed frame.
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [32]:
# Count missing values per column.
updated_df.isna().sum()

v1    0
v2    0
dtype: int64

In [33]:
# Force the feature column to strings so the text-cleaning helpers can call str methods on it.
updated_df[model_feature] = updated_df[model_feature].astype(str)

In [39]:
from stop_words import get_stop_words

# English stop-word list; it is passed to the cleaning helpers defined further down.
stop_words = get_stop_words('english')

In [40]:
# Show the stop-word list for inspection.
print(stop_words)

['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 't

In [41]:
import re, string
import nltk
from nltk.stem import SnowballStemmer

# English Snowball stemmer shared by every cleaning call.
stemmer = SnowballStemmer(language='english')

def preProcessText(text):
    """Return ``text`` lower-cased with leading and trailing whitespace removed."""
    return text.lower().strip()

def lexiconProcess(text, stop_words, stemmer):
    """Remove stop words from ``text`` and stem every remaining token.

    Parameters
    ----------
    text : str
        Text to process. Tokens are separated on any run of whitespace.
    stop_words : iterable of str
        Tokens to drop. Matching is exact, so the caller is responsible for
        casing.
    stemmer : object
        Anything exposing ``stem(word) -> str``.

    Returns
    -------
    str
        The stemmed, non-stop-word tokens joined by single spaces; an empty
        string when nothing survives.
    """
    # Hash-based membership instead of scanning a list once per token.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)

    # split() with no argument splits on any whitespace run and never yields
    # empty tokens, so repeated spaces, tabs and newlines no longer leak
    # stray blanks into the output.
    return " ".join(
        stemmer.stem(token) for token in text.split() if token not in stop_words
    )

def cleanSentence(text, stop_words, stemmer):
    """Normalise ``text`` with preProcessText, then filter and stem it with lexiconProcess."""
    normalised = preProcessText(text)
    return lexiconProcess(normalised, stop_words, stemmer)

# Run the cleaning helper over each text column, overwriting the column in place.
for c in (model_feature,):
    print('Text cleaning: ', c)
    updated_df[c] = updated_df[c].map(
        lambda raw: cleanSentence(raw, stop_words, stemmer)
    )

Text cleaning:  v2


In [50]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the shuffled split repeatable.
train_data, test_data = train_test_split(
    updated_df,
    random_state=42,
    shuffle=True,
    test_size=0.1,
)

In [51]:
# Shapes of the train and test partitions, one per line.
print(train_data.shape, test_data.shape, sep='\n')

(5014, 2)
(558, 2)


In [52]:
# Class distribution of the target over the full (pre-split) frame.
updated_df[model_target].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

In [53]:
# Report the size and per-class row counts of the training split ...
print('Training set shape:', train_data.shape)

print('Class ham samples in the training set:', (train_data[model_target] == 'ham').sum())
print('Class spam samples in the training set:', (train_data[model_target] == 'spam').sum())

print('')

# ... and of the held-out test split.
print('Test set shape:', test_data.shape)

print('Class ham samples in the test set:', (test_data[model_target] == 'ham').sum())
print('Class spam samples in the test set:', (test_data[model_target] == 'spam').sum())


Training set shape: (5014, 2)
Class ham samples in the training set: 4334
Class spam samples in the training set: 680

Test set shape: (558, 2)
Class ham samples in the test set: 491
Class spam samples in the test set: 67


In [54]:
# Separate the training rows by label so each class can be resampled on its own.
class_ham_no = train_data.loc[train_data[model_target].eq('ham')]
class_spam_no = train_data.loc[train_data[model_target].eq('spam')]

In [55]:
from sklearn.utils import resample
from sklearn.utils import shuffle

# Balance the training set by drawing spam rows with replacement until they
# match the ham count. The target size is read from the data rather than
# hard-coded to 4334, so the cell keeps working if the split changes. Every
# random step is seeded so a fresh Restart & Run All rebuilds the same frame.
upsampled = resample(
    class_spam_no, replace=True, n_samples=len(class_ham_no), random_state=42
)
# Sampling every ham row without replacement keeps all of them, in permuted order.
downsampled = resample(
    class_ham_no, replace=False, n_samples=len(class_ham_no), random_state=42
)

train_data = pd.concat([downsampled, upsampled])
train_data = shuffle(train_data, random_state=42)

In [56]:
# Size and per-class row counts of the training set after resampling.
print('Training set shape:', train_data.shape)

print('Class ham samples in the training set:', (train_data[model_target] == 'ham').sum())
print('Class spam samples in the training set:', (train_data[model_target] == 'spam').sum())

print('')


Training set shape: (8668, 2)
Class ham samples in the training set: 4334
Class spam samples in the training set: 4334



In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Bag-of-words step: binary presence flags over a vocabulary capped at 50 terms.
text_processor_0 = Pipeline([
    ('text_vect_0', CountVectorizer(binary=True, max_features=50)),
])

# Vectorise the text, then fit a decision tree on the result. The tree is
# seeded (same seed as the train/test split) so repeated fits on the same
# data produce the same model.
# NOTE(review): CountVectorizer expects an iterable of strings, so fit this on
# train_data[model_feature] rather than the whole frame. Confirm in the fitting cell.
pipeline = Pipeline([
    ('data_preprocessing', text_processor_0),
    ('dt', DecisionTreeClassifier(random_state=42)),
])