In [1]:
import nltk 
import pandas as pd

In [2]:
# Load the SMS dataset from the working directory.
# NOTE(review): decoded as Latin-1, presumably because the file is not valid
# UTF-8 - confirm against the data source.
corpus = pd.read_csv(
    'spam.csv',
    encoding='latin',
)

In [3]:
# Inspect the frame exactly as loaded, before any column clean-up.
corpus

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
# Discard the three "Unnamed" columns, which are empty in the preview rows
# (NOTE(review): presumably empty throughout - confirm before relying on it).
# Reassigning instead of inplace=True, together with errors="ignore", keeps
# this cell safe to re-run: a second run finds nothing to drop instead of
# raising KeyError.
corpus = corpus.drop(
    columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
    errors="ignore",
)

In [5]:
# Give the two remaining columns descriptive names: v1 -> label, v2 -> message.
# Reassignment (rather than inplace=True) avoids mutating the object that
# earlier cells already displayed; the resulting frame is the same.
corpus = corpus.rename(columns={"v1": 'label', "v2": 'message'})

In [6]:
# Check the frame after the column drop and rename: only label and message remain.
corpus

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [8]:
# One shared lemmatizer instance, reused for every token during preprocessing.
# NOTE(review): assumes the NLTK 'wordnet' data is already installed locally;
# no nltk.download call is visible in this notebook.
wnl = WordNetLemmatizer()

In [9]:
# Normalise every message into a cleaned, space-separated string:
#   1. replace every non-letter character with a space,
#   2. lower-case and split on whitespace,
#   3. drop English stop words,
#   4. lemmatize the remaining tokens.
#
# The stop-word collection is built once, as a set. Calling
# stopwords.words('english') inside the comprehension re-evaluates it for
# every token, and membership in the returned list is a linear scan.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

messages = []
# Iterate over the column values directly. corpus["message"][i] is a label
# lookup, so it only works while the index is exactly 0..len(corpus)-1.
for text in corpus["message"]:
    tokens = non_letters.sub(" ", text).lower().split()
    kept = [wnl.lemmatize(token) for token in tokens if token not in stop_words]
    messages.append(' '.join(kept))

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Unigram bag of words capped at 100 vocabulary terms. binary=True stores
# 1/0 for presence/absence of a term instead of its occurrence count.
cv = CountVectorizer(binary=True, max_features=100)
# fit_transform returns a sparse matrix; densify it for inspection below.
X = cv.fit_transform(messages).toarray()

In [13]:
# Sanity check: one row per message, one column per retained term (max_features=100).
X.shape

(5572, 100)

In [14]:
# Peek at the dense binary feature matrix (numpy elides the middle when printing).
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
# Learned vocabulary: maps each retained term to its column index in X.
cv.vocabulary_

{'go': 23,
 'great': 27,
 'got': 26,
 'wat': 90,
 'ok': 59,
 'free': 19,
 'win': 94,
 'text': 79,
 'txt': 86,
 'say': 70,
 'already': 0,
 'think': 82,
 'life': 39,
 'hey': 30,
 'week': 92,
 'back': 5,
 'like': 40,
 'still': 75,
 'send': 72,
 'friend': 20,
 'prize': 65,
 'claim': 9,
 'call': 6,
 'mobile': 50,
 'co': 10,
 'home': 32,
 'want': 89,
 'today': 84,
 'cash': 8,
 'day': 15,
 'reply': 67,
 'www': 96,
 'right': 68,
 'take': 77,
 'time': 83,
 'message': 47,
 'com': 11,
 'oh': 58,
 'yes': 99,
 'make': 45,
 'way': 91,
 'dont': 17,
 'miss': 49,
 'ur': 88,
 'going': 24,
 'da': 14,
 'lor': 42,
 'meet': 46,
 'really': 66,
 'know': 35,
 'lol': 41,
 'love': 43,
 'let': 38,
 'work': 95,
 'yeah': 97,
 'tell': 78,
 'anything': 2,
 'thanks': 80,
 'uk': 87,
 'please': 63,
 'msg': 52,
 'see': 71,
 'pls': 64,
 'need': 54,
 'tomorrow': 85,
 'hope': 33,
 'well': 93,
 'lt': 44,
 'gt': 28,
 'get': 21,
 'ask': 3,
 'morning': 51,
 'happy': 29,
 'sorry': 74,
 'give': 22,
 'new': 55,
 'find': 18,
 'year

In [16]:
# Bag of words built from n-grams instead of single words:
# ngram_range=(2, 3) keeps bigrams and trigrams only, max_features=500 caps
# the vocabulary size, and binary=True records presence rather than counts.
cv_gram = CountVectorizer(ngram_range=(2,3), max_features=500, binary=True)
X_gram = cv_gram.fit_transform(messages).toarray()
# Display the dense n-gram feature matrix.
X_gram

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
# Learned n-gram vocabulary: maps each retained bigram/trigram to its column index in X_gram.
cv_gram.vocabulary_

{'free entry': 142,
 'rate apply': 344,
 'per request': 305,
 'claim call': 61,
 'call claim': 26,
 'claim code': 62,
 'call claim code': 27,
 'update latest': 445,
 'latest colour': 223,
 'free call': 139,
 'call mobile': 38,
 'entitled update latest': 125,
 'chance win': 59,
 'win cash': 486,
 'chance win cash': 60,
 'txt word': 438,
 'dont miss': 115,
 'let know': 226,
 'feel like': 133,
 'go home': 160,
 'anything lor': 5,
 'call reply': 44,
 'mobile free': 269,
 'free camcorder': 140,
 'please call': 313,
 'delivery tomorrow': 109,
 'lt gt': 242,
 'missed call': 266,
 'want go': 474,
 'first time': 138,
 'like lt': 229,
 'like lt gt': 230,
 'sm ac': 376,
 'bx ip': 24,
 'sorry call': 380,
 'call later': 36,
 'later meeting': 221,
 'sorry call later': 381,
 'awarded bonus': 14,
 'prize call': 337,
 'ur awarded': 446,
 'call free': 30,
 'thats cool': 418,
 'hi hi': 195,
 'call customer': 28,
 'customer service': 97,
 'service representative': 369,
 'guaranteed cash': 180,
 'cash priz

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over bigrams only (ngram_range=(2, 2)), with the vocabulary
# capped at 100 entries. Unlike the binary bag-of-words matrices, the values
# here are floating-point weights.
tfidf = TfidfVectorizer(ngram_range=(2,2), max_features=100)
X_tfidf = tfidf.fit_transform(messages).toarray()
# Display the dense TF-IDF matrix.
X_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
# Learned bigram vocabulary: maps each retained bigram to its column index in X_tfidf.
tfidf.vocabulary_

{'free entry': 32,
 'claim call': 16,
 'call claim': 3,
 'free call': 31,
 'chance win': 15,
 'txt word': 89,
 'let know': 53,
 'please call': 67,
 'lt gt': 57,
 'want go': 97,
 'like lt': 54,
 'sm ac': 79,
 'sorry call': 80,
 'call later': 8,
 'ur awarded': 90,
 'hi hi': 45,
 'call customer': 4,
 'customer service': 23,
 'cash prize': 14,
 'trying contact': 86,
 'draw show': 28,
 'show prize': 78,
 'prize guaranteed': 73,
 'guaranteed call': 41,
 'valid hr': 95,
 'selected receive': 75,
 'private account': 71,
 'account statement': 0,
 'statement show': 81,
 'call identifier': 5,
 'identifier code': 49,
 'code expires': 20,
 'urgent mobile': 94,
 'call landline': 7,
 'wat time': 98,
 'ur mob': 93,
 'gud ni': 42,
 'new year': 62,
 'send stop': 77,
 'co uk': 19,
 'nice day': 63,
 'lt decimal': 56,
 'decimal gt': 25,
 'txt nokia': 87,
 'good morning': 36,
 'ur friend': 92,
 'good night': 37,
 'camcorder reply': 12,
 'reply call': 74,
 'po box': 69,
 'last night': 52,
 'camera phone': 13,