Datasets/spam classification/SMSSpamCollection.txt

In [1]:
import numpy as np
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('Datasets/spam classification/SMSSpamCollection.txt',
			sep='	',names=['label','massages'])

In [3]:
df.head()

Unnamed: 0,label,massages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df['label'].unique()

array(['ham', 'spam'], dtype=object)

In [5]:
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
all_common_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\Tafheem
[nltk_data]     Ahemad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Preprocessing

lemmatizer=WordNetLemmatizer()

corpus=[]
for sen in df['massages']:
	sen=sen.lower()
	sen=re.sub('[^a-z A-z]','',sen)
	words=word_tokenize(sen)
	words=[word for word in words if word not in all_common_words]
	words=[lemmatizer.lemmatize(word) for word in words]

	sen=" ".join(words)
	corpus.append(sen)

In [7]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry questionstd txt ratetcs apply over',
 'u dun say early hor u c already say',
 'nah dont think go usf life around though',
 'freemsg hey darling week word back id like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'im gon na home soon dont want talk stuff anymore tonight k ive cried enough today',
 'six chance win cash pound txt csh send cost pday day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim tc wwwdbukn

In [8]:
# means some lines have zero lengh

for i in range(len(corpus)):
	if(len(corpus[i])==0) :
		print(i)

960
1612
2807
3376
4575
4824


In [9]:
senLengh=[len(sen)>0 for sen in corpus]

In [10]:
df=df[senLengh]

In [11]:
corpus=[sen for sen in corpus if len(sen)>0]

In [12]:
df['label']=df['label'].apply(lambda x:1 if x=='spam' else 0)

In [13]:
for i in range(len(corpus)):
	words=word_tokenize(corpus[i])
	corpus[i]=words

In [27]:
corpus

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'u', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'questionstd',
  'txt',
  'ratetcs',
  'apply',
  'over'],
 ['u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say'],
 ['nah', 'dont', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'id',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',

In [15]:
from gensim.models import Word2Vec
model=Word2Vec(sentences=corpus,vector_size=60,min_count=3)

In [16]:
def avg_word2vec(doc):
	vectors=[model.wv[word] for word in doc if word in model.wv]
	if(not vectors):
		return np.zeros(model.vector_size)
	return np.mean(vectors,axis=0)

word_vectors=[avg_word2vec(doc) for doc in corpus]

In [17]:
type(word_vectors)

list

In [18]:
X=pd.DataFrame(word_vectors)
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,0.121667,0.318205,0.019179,-0.018505,-0.292001,-0.046217,0.424812,0.600286,-0.001508,-0.013951,...,-0.119606,0.197614,0.145159,0.367555,0.193659,0.4581,0.044099,-0.048602,0.499755,-0.477886
1,0.101096,0.252143,0.019915,-0.005415,-0.239388,-0.045361,0.340536,0.483929,-0.005503,-0.014351,...,-0.094186,0.16415,0.112272,0.298364,0.15464,0.36761,0.036759,-0.040392,0.404224,-0.384011
2,0.090707,0.26415,0.018801,-0.00847,-0.26563,-0.045285,0.349339,0.504552,-0.009308,-0.011326,...,-0.110256,0.159326,0.116344,0.301067,0.170564,0.375448,0.040281,-0.040304,0.417078,-0.416506
3,0.190415,0.467113,0.037199,-0.01793,-0.425045,-0.080614,0.624818,0.894755,-0.006738,-0.016953,...,-0.183185,0.292918,0.206773,0.552088,0.295882,0.678442,0.062696,-0.088484,0.74017,-0.696764
4,0.138172,0.349354,0.018111,-0.021338,-0.318954,-0.062097,0.46597,0.671555,0.001257,-0.027754,...,-0.132725,0.208436,0.157633,0.407893,0.21088,0.506079,0.046217,-0.056187,0.555161,-0.525158


In [19]:
Y=df['label']

In [20]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=.22,random_state=15)

In [21]:
from sklearn.ensemble import RandomForestClassifier
random=RandomForestClassifier(n_estimators=2)

In [24]:
random.fit(X_train,Y_train)

In [25]:
Y_pred=random.predict(X_test)

In [26]:
from sklearn.metrics import classification_report,accuracy_score

print(classification_report(Y_test,Y_pred))
print(accuracy_score(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.96      1055
           1       0.83      0.54      0.65       170

    accuracy                           0.92      1225
   macro avg       0.88      0.76      0.81      1225
weighted avg       0.92      0.92      0.91      1225

0.9208163265306123
