In [1]:
import pandas as pd

In [2]:
# Read the labelled spam/ham messages from the CSV file into a DataFrame,
# then preview the first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row
# index that was written into the CSV -- confirm, and consider index_col=0.
messages = pd.read_csv(filepath_or_buffer='spam_ham_dataset.csv')
messages.head(n=5)

Unnamed: 0,Label,Message
0,spam,Congratulations! You've won a free ticket to t...
1,ham,"Hey, are we still on for lunch tomorrow?"
2,spam,You have been selected to receive a $1000 Walm...
3,ham,Can you send me the report by 5 PM today?
4,spam,Exclusive offer! Get 50% off on your next purc...


### Data Cleaning and Preprocessing

In [4]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
import re

In [5]:
# Single WordNet lemmatizer instance, created once and reused by the cleaning loop.
lemma = WordNetLemmatizer()

In [7]:
# Build the cleaned corpus: one normalised, space-joined string per message.
#
# The English stop-word list is loop-invariant, so it is loaded once up front.
# Keeping it in a set makes each membership test O(1) instead of rebuilding and
# linearly scanning the list for every single word.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly: messages['Message'][i] is a label
# lookup and only works when the DataFrame has a default 0..n-1 index.
for message in messages['Message']:
    # '^' inside a character class negates it, so every non-letter becomes a space.
    words = re.sub("[^a-zA-Z]", " ", message).lower().split()
    # Drop stop words, then lemmatize the words that remain.
    words = [lemma.lemmatize(word) for word in words if word not in stop_words]
    corpus.append(" ".join(words))

In [8]:
# Display the cleaned messages to check the preprocessing result.
corpus

['congratulation free ticket bahamas call claim prize',
 'hey still lunch tomorrow',
 'selected receive walmart gift card click claim',
 'send report pm today',
 'exclusive offer get next purchase limited time',
 'forget meeting tomorrow',
 'urgent account compromised click link secure',
 'happy birthday hope great day',
 'win brand new iphone participate survey get chance win',
 'help presentation next meeting']

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# TF-IDF vectorizer with the vocabulary capped at 30 features (max_features=30);
# terms that do not make the cap are dropped from the vocabulary.
tf_idf = TfidfVectorizer(max_features=30)

In [12]:
# Learn the vocabulary and IDF weights from the cleaned corpus, then encode the
# same documents as a dense TF-IDF matrix (one row per message).
tf_idf.fit(corpus)
X = tf_idf.transform(corpus).toarray()
X

array([[0.        , 0.5182909 , 0.        , 0.        , 0.44059462,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.5182909 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.5182909 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.76190497, 0.        , 0.        , 0.        ,
        0.        , 0.64768883, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.40319445,
        0.40319445, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.

In [13]:
# Fitted vocabulary: maps each kept term to its column index in X.
tf_idf.vocabulary_

{'ticket': 23,
 'bahamas': 1,
 'claim': 4,
 'prize': 14,
 'still': 21,
 'tomorrow': 26,
 'selected': 19,
 'receive': 16,
 'walmart': 28,
 'click': 5,
 'send': 20,
 'report': 17,
 'pm': 12,
 'today': 25,
 'offer': 10,
 'get': 6,
 'next': 9,
 'purchase': 15,
 'time': 24,
 'meeting': 7,
 'urgent': 27,
 'account': 0,
 'secure': 18,
 'birthday': 2,
 'win': 29,
 'brand': 3,
 'new': 8,
 'participate': 11,
 'survey': 22,
 'presentation': 13}