In [1]:
import pandas as pd
data  = pd.read_csv('/content/SMSSpamCollection.txt', sep='\t', names=["label", "message"])


In [2]:
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Data Cleaning and preprocessing

In [3]:
#Data Cleaning and preprocessing
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [5]:
corpus=[]
for i in range(0,len(data)):
  review = re.sub('[^a-zA-Z0-9]', ' ', data['message'][i])
  review = review.lower()
  review = review.split()
  a=[]
  for word in review:
    if word not in stopwords.words('english'):
      a.append(ps.stem(word))
  review = ' '.join(a)
  corpus.append(review)

In [6]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

# Model Creation using BOW(Bag of Words)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500,binary=True)
X = cv.fit_transform(corpus).toarray()

In [8]:
X[1]

array([0, 0, 0, ..., 0, 0, 0])

In [9]:
y = pd.get_dummies(data['label'])
y = y.iloc[:,1].values

In [10]:
y

array([False, False,  True, ..., False, False, False])

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [12]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(x_train, y_train)

In [13]:
#prediction
y_pred = spam_detect_model.predict(x_test)

In [14]:
from sklearn.metrics import accuracy_score, classification_report
score = accuracy_score(y_test, y_pred)
print(score)
print(classification_report(y_test,y_pred))

0.9865470852017937
              precision    recall  f1-score   support

       False       0.99      0.99      0.99       955
        True       0.97      0.94      0.95       160

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



# Model Creating using TF-IDF

In [15]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500)
X_tf = tv.fit_transform(corpus).toarray()

In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=0)

In [17]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(x_train, y_train)

In [18]:
y_pred = spam_detect_model.predict(x_test)

In [19]:
from sklearn.metrics import accuracy_score, classification_report
score = accuracy_score(y_test, y_pred)
print(score)
print(classification_report(y_test, y_pred))

0.9786995515695067
              precision    recall  f1-score   support

       False       0.99      0.98      0.99       781
        True       0.89      0.95      0.92       111

    accuracy                           0.98       892
   macro avg       0.94      0.96      0.95       892
weighted avg       0.98      0.98      0.98       892



In [20]:
user_input = input("Enter the mail: ")
user_input = re.sub('[^a-zA-Z0-9]', ' ', user_input)
user_input = user_input.lower()
user_input = user_input.split()
a = []
for word in user_input:
  if word not in stopwords.words('english'):
    a.append(ps.stem(word))
user_input = ' '.join(a)

Enter the mail: hi


# Word2Vec Implementation

In [21]:
!pip install gensim



In [22]:
nltk.download('wordnet')
stopword = stopwords.words('english')
stopword.remove('not')
from nltk.stem import WordNetLemmatizer
le = WordNetLemmatizer()
corpus_word2vec = []
for i in range(0, len(data)):
  review = re.sub('[^a-zA-Z0-9]', ' ', data['message'][i])
  review = review.lower()
  review = review.split()
  a = []
  for word in review:
    if word not in stopword:
      a.append(le.lemmatize(word))
  review = ' '.join(a)
  corpus_word2vec.append(review)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [23]:
corpus_word2vec

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question std txt rate c apply 08452810075over18',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling 3 week word back like fun still tb ok xxx std chgs send 1 50 rcv',
 'even brother not like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy friend callertune',
 'winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobile 11 month u r entitled update latest colour mobile camera free call mobile update co free 08002986030',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6days 16 tsandcs apply r

In [24]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [25]:
words = []
for sent in corpus_word2vec:
  sent_token = sent_tokenize(sent)
  for sent in sent_token:
    words.append(simple_preprocess(sent))
for i in range(0, 5):
  words.append(' ')

In [26]:
words

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'not', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected'

In [27]:
import gensim

In [28]:
model = gensim.models.Word2Vec(words, window=5, min_count=2)

In [29]:
model.wv.index_to_key

['call',
 'not',
 'get',
 'ur',
 'gt',
 'go',
 'lt',
 'ok',
 'free',
 'day',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'text',
 'love',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'take',
 'mobile',
 'back',
 'da',
 'dont',
 'reply',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'dear',
 'make',
 'night',
 'message',
 'well',
 'say',
 'min',
 'thing',
 'much',
 'hope',
 'great',
 'oh',
 'claim',
 'hey',
 'give',
 'number',
 'happy',
 'friend',
 'wat',
 'work',
 'yes',
 'way',
 'www',
 'let',
 'prize',
 'right',
 'tomorrow',
 'already',
 'win',
 'ask',
 'said',
 'life',
 'amp',
 'cash',
 'im',
 'yeah',
 'tone',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'last',
 'thanks',
 'uk',
 'service',
 'care',
 'com',
 'anything',
 'would',
 'year',
 'nokia',
 'also',
 'lol',
 'feel',
 'every',
 'keep',
 'sure',
 'pick',
 'urgent',
 'sent',
 'co

In [30]:
model.corpus_count


5572

In [31]:
model.wv.similar_by_word('happy')

[('amp', 0.999462366104126),
 ('make', 0.9994527697563171),
 ('thing', 0.9994526505470276),
 ('day', 0.9994474053382874),
 ('great', 0.9994207620620728),
 ('want', 0.9994158148765564),
 ('wish', 0.9994147419929504),
 ('much', 0.9994035363197327),
 ('year', 0.9993833303451538),
 ('think', 0.9993676543235779)]

In [32]:
model.wv['kid'].shape

(100,)

In [33]:
import numpy as np
def avg_word2vec(doc):
  return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis=0)

In [34]:
!pip install tqdm



In [35]:
from tqdm import tqdm

In [36]:
words[73]

['performed']

In [37]:
X=[]
for i in tqdm(range(len(words))):
  X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5572/5572 [00:02<00:00, 1892.21it/s]


In [38]:
X

[array([-0.17619301,  0.2234361 ,  0.1973728 ,  0.07422655,  0.07428159,
        -0.43410268,  0.13646574,  0.5615178 , -0.21362066, -0.20256175,
        -0.08063753, -0.3786578 , -0.04293318,  0.10817673,  0.10147076,
        -0.19641866,  0.06510233, -0.3642354 , -0.05444303, -0.45683047,
         0.06727951,  0.07566767,  0.05052672, -0.12573563, -0.053627  ,
         0.0623174 , -0.24600528, -0.11522461, -0.22412634,  0.05922028,
         0.19833504,  0.02349463,  0.02691493, -0.07521723, -0.09325298,
         0.33586755,  0.04614584, -0.20773166, -0.0924049 , -0.46851984,
        -0.03003119, -0.21912654, -0.06310266,  0.06396685,  0.29021713,
        -0.07716728, -0.19206418, -0.01843518,  0.09831345,  0.21597274,
         0.1522805 , -0.23923863, -0.06443602, -0.01817761, -0.16226332,
         0.12759994,  0.19624984, -0.04054961, -0.21433842,  0.07352813,
         0.10371679,  0.09918035, -0.02669909, -0.03670448, -0.32131752,
         0.20181538,  0.11138573,  0.19494212, -0.3

In [39]:
X_new=np.array(X, dtype="object")

In [40]:
X_new[0]

array([-0.17619301,  0.2234361 ,  0.1973728 ,  0.07422655,  0.07428159,
       -0.43410268,  0.13646574,  0.5615178 , -0.21362066, -0.20256175,
       -0.08063753, -0.3786578 , -0.04293318,  0.10817673,  0.10147076,
       -0.19641866,  0.06510233, -0.3642354 , -0.05444303, -0.45683047,
        0.06727951,  0.07566767,  0.05052672, -0.12573563, -0.053627  ,
        0.0623174 , -0.24600528, -0.11522461, -0.22412634,  0.05922028,
        0.19833504,  0.02349463,  0.02691493, -0.07521723, -0.09325298,
        0.33586755,  0.04614584, -0.20773166, -0.0924049 , -0.46851984,
       -0.03003119, -0.21912654, -0.06310266,  0.06396685,  0.29021713,
       -0.07716728, -0.19206418, -0.01843518,  0.09831345,  0.21597274,
        0.1522805 , -0.23923863, -0.06443602, -0.01817761, -0.16226332,
        0.12759994,  0.19624984, -0.04054961, -0.21433842,  0.07352813,
        0.10371679,  0.09918035, -0.02669909, -0.03670448, -0.32131752,
        0.20181538,  0.11138573,  0.19494212, -0.35737732,  0.35

In [51]:
X_new.shape, y.shape, len(data), len(corpus_word2vec), len(words)

((5572,), (5572,), 5572, 5572, 5572)

In [56]:
X_new = np.array(X_new)

In [57]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=0)

In [58]:
from sklearn.ensemble import RandomForestClassifier
spam_detect_model = RandomForestClassifier().fit(x_train, y_train)

ValueError: setting an array element with a sequence.