Bag of Words (Spam/Ham)

In [5]:
!pip install chardet
import chardet
with open("spam.csv", "rb") as f:
    raw_data = f.read(100000)  # Read a large enough sample
    result = chardet.detect(raw_data)

print("Detected encoding:", result["encoding"])

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
Installing collected packages: chardet
Successfully installed chardet-5.2.0
Detected encoding: Windows-1252


In [8]:
import pandas

# Load the SMS corpus into a two-column frame (label, message text).
dataset = pandas.read_csv(
    "spam.csv",
    # Column names are supplied explicitly, so the first line of the file is read as data.
    names=["label", "message"],
    # Encoding reported by the chardet check on this file.
    encoding='Windows-1252',
    # NOTE(review): malformed rows are dropped without any warning; confirm the
    # number of skipped rows is acceptable for this analysis.
    on_bad_lines='skip',
)

In [9]:
# Preview the loaded frame; pandas abbreviates the display to the first and last rows.
dataset

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5517,spam,This is the 2nd time we have tried 2 contact u...
5518,ham,Will ï¿½_ b going to esplanade fr home?
5519,ham,"Pity, * was in mood for that. So...any other s..."
5520,ham,The guy did some bitching but I acted like i'd...


Data Cleaning and Processing

In [10]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer



In [11]:
# Number of rows (messages) that were loaded.
dataset.shape[0]

5522

In [13]:
# Full text of the message in the row labelled 0 (the table preview truncates it).
dataset.loc[0, 'message']

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [24]:
# Build one cleaned, stemmed string per message as input for the bag-of-words vectorizer.
stemmer = PorterStemmer()

# Load the stopword list once. The original evaluated stopwords.words('english')
# for every word of every message; a set also gives constant-time membership checks.
english_stopwords = set(stopwords.words('english'))

# Matches every character that is neither an ASCII letter nor whitespace.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')

tokenized_messages = []
for raw_message in dataset['message']:
    # Lowercase, then delete (not space-replace) digits and punctuation,
    # so e.g. "don't" becomes "dont" before the stopword check.
    message = non_letter_pattern.sub('', raw_message.lower())
    stemmed_words = [
        stemmer.stem(word)
        for word in message.split()
        if word not in english_stopwords
    ]
    # Words are joined with two spaces, exactly as before.
    tokenized_messages.append('  '.join(stemmed_words))

tokenized_messages

['go  jurong  point  crazi  avail  bugi  n  great  world  la  e  buffet  cine  got  amor  wat',
 'ok  lar  joke  wif  u  oni',
 'free  entri  wkli  comp  win  fa  cup  final  tkt  st  may  text  fa  receiv  entri  questionstd  txt  ratetc  appli  over',
 'u  dun  say  earli  hor  u  c  alreadi  say',
 'nah  dont  think  goe  usf  live  around  though',
 'freemsg  hey  darl  week  word  back  id  like  fun  still  tb  ok  xxx  std  chg  send  rcv',
 'even  brother  like  speak  treat  like  aid  patent',
 'per  request  mell  mell  oru  minnaminungint  nurungu  vettam  set  callertun  caller  press  copi  friend  callertun',
 'winner  valu  network  custom  select  receivea  prize  reward  claim  call  claim  code  kl  valid  hour',
 'mobil  month  u  r  entitl  updat  latest  colour  mobil  camera  free  call  mobil  updat  co  free',
 'im  gonna  home  soon  dont  want  talk  stuff  anymor  tonight  k  ive  cri  enough  today',
 'six  chanc  win  cash  pound  txt  csh  send  cost  pda

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words with raw term counts (binary=False), limited to the
# 2500 most frequent terms across the corpus.
cv = CountVectorizer(max_features=2500, binary=False)

# Document-term matrix: one row per message, one column per retained term.
X = cv.fit_transform(tokenized_messages)

# The dense X.toarray() call that used to sit here was removed: its result was
# neither displayed nor stored, so it only allocated a large throwaway array.

# Mapping of each retained term to its column index in X.
cv.vocabulary_


{'go': np.int64(883),
 'point': np.int64(1617),
 'crazi': np.int64(450),
 'avail': np.int64(142),
 'bugi': np.int64(274),
 'great': np.int64(918),
 'world': np.int64(2434),
 'la': np.int64(1162),
 'cine': np.int64(370),
 'got': np.int64(904),
 'wat': np.int64(2366),
 'ok': np.int64(1477),
 'lar': np.int64(1171),
 'joke': np.int64(1123),
 'wif': np.int64(2404),
 'oni': np.int64(1487),
 'free': np.int64(791),
 'entri': np.int64(620),
 'wkli': np.int64(2423),
 'comp': np.int64(405),
 'win': np.int64(2409),
 'fa': np.int64(683),
 'cup': np.int64(463),
 'final': np.int64(734),
 'tkt': np.int64(2210),
 'st': np.int64(2064),
 'may': np.int64(1309),
 'text': np.int64(2171),
 'receiv': np.int64(1777),
 'questionstd': np.int64(1728),
 'txt': np.int64(2268),
 'ratetc': np.int64(1754),
 'appli': np.int64(100),
 'dun': np.int64(579),
 'say': np.int64(1898),
 'earli': np.int64(583),
 'alreadi': np.int64(67),
 'nah': np.int64(1410),
 'dont': np.int64(559),
 'think': np.int64(2185),
 'goe': np.int64(8

Using Lemmatization

In [29]:
from nltk import WordNetLemmatizer

# Build one cleaned, lemmatized string per message as input for the bag-of-words vectorizer.
lemmatizer = WordNetLemmatizer()

# Load the stopword list once. The original evaluated stopwords.words('english')
# for every word of every message; a set also gives constant-time membership checks.
english_stopwords = set(stopwords.words('english'))

# Matches every character that is neither an ASCII letter nor whitespace.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')

tokenized_messages = []
for raw_message in dataset['message']:
    # Lowercase, then delete (not space-replace) digits and punctuation,
    # so e.g. "don't" becomes "dont" before the stopword check.
    message = non_letter_pattern.sub('', raw_message.lower())
    # lemmatize() is called without a pos argument, so each word is looked up
    # with the lemmatizer's default part of speech.
    lemmatized_words = [
        lemmatizer.lemmatize(word)
        for word in message.split()
        if word not in english_stopwords
    ]
    # Words are joined with two spaces, exactly as before.
    tokenized_messages.append('  '.join(lemmatized_words))

tokenized_messages

['go  jurong  point  crazy  available  bugis  n  great  world  la  e  buffet  cine  got  amore  wat',
 'ok  lar  joking  wif  u  oni',
 'free  entry  wkly  comp  win  fa  cup  final  tkts  st  may  text  fa  receive  entry  questionstd  txt  ratetcs  apply  over',
 'u  dun  say  early  hor  u  c  already  say',
 'nah  dont  think  go  usf  life  around  though',
 'freemsg  hey  darling  week  word  back  id  like  fun  still  tb  ok  xxx  std  chgs  send  rcv',
 'even  brother  like  speak  treat  like  aid  patent',
 'per  request  melle  melle  oru  minnaminunginte  nurungu  vettam  set  callertune  caller  press  copy  friend  callertune',
 'winner  valued  network  customer  selected  receivea  prize  reward  claim  call  claim  code  kl  valid  hour',
 'mobile  month  u  r  entitled  update  latest  colour  mobile  camera  free  call  mobile  update  co  free',
 'im  gonna  home  soon  dont  want  talk  stuff  anymore  tonight  k  ive  cried  enough  today',
 'six  chance  win  ca

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words: every non-zero term count is recorded as 1,
# with the vocabulary capped at 2500 terms.
cv = CountVectorizer(
    binary=True,
    max_features=2500,
)

# Learn the vocabulary and encode the messages in a single pass.
X = cv.fit_transform(tokenized_messages)

# Mapping of each retained term to its column index in X.
cv.vocabulary_

{'go': np.int64(877),
 'point': np.int64(1623),
 'crazy': np.int64(489),
 'available': np.int64(144),
 'bugis': np.int64(280),
 'great': np.int64(901),
 'world': np.int64(2436),
 'la': np.int64(1146),
 'buffet': np.int64(279),
 'cine': np.int64(407),
 'got': np.int64(893),
 'wat': np.int64(2361),
 'ok': np.int64(1478),
 'lar': np.int64(1155),
 'joking': np.int64(1098),
 'wif': np.int64(2401),
 'oni': np.int64(1488),
 'free': np.int64(809),
 'entry': np.int64(666),
 'wkly': np.int64(2422),
 'comp': np.int64(443),
 'win': np.int64(2406),
 'cup': np.int64(501),
 'final': np.int64(751),
 'tkts': np.int64(2197),
 'st': np.int64(2021),
 'may': np.int64(1304),
 'text': np.int64(2149),
 'receive': np.int64(1730),
 'txt': np.int64(2257),
 'apply': np.int64(100),
 'dun': np.int64(625),
 'say': np.int64(1832),
 'early': np.int64(630),
 'already': np.int64(66),
 'nah': np.int64(1408),
 'dont': np.int64(602),
 'think': np.int64(2171),
 'usf': np.int64(2298),
 'life': np.int64(1195),
 'around': np.i