In [1]:
import pandas as pd


In [2]:
#loading the phishing URLs data to dataframe
data0 = pd.read_csv('/Users/HP/projectML/email_pre_processed.csv')
data0.head()

Unnamed: 0,text,spam
0,subject naturally irresistible corporate ident...,1
1,subject stock trading gunslinger fanny merrill...,1
2,subject unbelievable new home made easy im wan...,1
3,subject color printing special request additio...,1
4,subject money get software cd software compati...,1


In [3]:
x_tokenized = [[w for w in sentence.split(" ") if w != ""] for sentence in data0['text']]
x_tokenized[0]

['subject',
 'naturally',
 'irresistible',
 'corporate',
 'identity',
 'lt',
 'really',
 'hard',
 'recollect',
 'company',
 'market',
 'full',
 'suqgestions',
 'information',
 'isoverwhelminq',
 'good',
 'catchy',
 'logo',
 'stylish',
 'statlonery',
 'outstanding',
 'website',
 'make',
 'task',
 'much',
 'easier',
 'promise',
 'havinq',
 'ordered',
 'iogo',
 'company',
 'automaticaily',
 'become',
 'world',
 'ieader',
 'isguite',
 'ciear',
 'without',
 'good',
 'product',
 'effective',
 'business',
 'organization',
 'practicable',
 'aim',
 'hotat',
 'nowadays',
 'market',
 'promise',
 'marketing',
 'effort',
 'become',
 'much',
 'effective',
 'list',
 'clear',
 'benefit',
 'creativeness',
 'hand',
 'made',
 'original',
 'logo',
 'specially',
 'done',
 'reflect',
 'distinctive',
 'company',
 'image',
 'convenience',
 'logo',
 'stationery',
 'provided',
 'format',
 'easy',
 'use',
 'content',
 'management',
 'system',
 'letsyou',
 'change',
 'website',
 'content',
 'even',
 'structure',


In [4]:
import time
import gensim
from gensim.models import Word2Vec
start = time.time()

model = gensim.models.Word2Vec(x_tokenized,
                 vector_size=100
                 # Size is the length of our vector.
                )

end = round(time.time()-start,2)
print("This process took",end,"seconds.")

This process took 5.39 seconds.


In [5]:
model.wv.most_similar("free")

[('risknews', 0.7928518056869507),
 ('secure', 0.7576252818107605),
 ('ad', 0.7198717594146729),
 ('membership', 0.6988348364830017),
 ('feel', 0.6977812647819519),
 ('online', 0.6929686069488525),
 ('quote', 0.6859177947044373),
 ('simply', 0.6834090948104858),
 ('anywhere', 0.6743285655975342),
 ('photo', 0.6704782843589783)]

In [6]:
class Sequencer():
    
    def __init__(self,
                 all_words,
                 max_words,
                 seq_len,
                 embedding_matrix
                ):
        
        self.seq_len = seq_len
        self.embed_matrix = embedding_matrix
        """
        temp_vocab = Vocab which has all the unique words
        self.vocab = Our last vocab which has only most used N words.
    
        """
        temp_vocab = list(set(all_words))
        self.vocab = []
        self.word_cnts = {}
        """
        Now we'll create a hash map (dict) which includes words and their occurencies
        """
        for word in temp_vocab:
            # 0 does not have a meaning, you can add the word to the list
            # or something different.
            count = len([0 for w in all_words if w == word])
            self.word_cnts[word] = count
            counts = list(self.word_cnts.values())
            indexes = list(range(len(counts)))
        
        # Now we'll sort counts and while sorting them also will sort indexes.
        # We'll use those indexes to find most used N word.
        cnt = 0
        while cnt + 1 != len(counts):
            cnt = 0
            for i in range(len(counts)-1):
                if counts[i] < counts[i+1]:
                    counts[i+1],counts[i] = counts[i],counts[i+1]
                    indexes[i],indexes[i+1] = indexes[i+1],indexes[i]
                else:
                    cnt += 1
        
        for ind in indexes[:max_words]:
            self.vocab.append(temp_vocab[ind])
                    
    def textToVector(self,text):
        # First we need to split the text into its tokens and learn the length
        # If length is shorter than the max len we'll add some spaces (100D vectors which has only zero values)
        # If it's longer than the max len we'll trim from the end.
        tokens = text.split()
        len_v = len(tokens)-1 if len(tokens) < self.seq_len else self.seq_len-1
        vec = []
        for tok in tokens[:len_v]:
            try:
                vec.append(self.embed_matrix[tok])
            except Exception as E:
                pass
        
        last_pieces = self.seq_len - len(vec)
        for i in range(last_pieces):
            vec.append(np.zeros(100,))
        
        return np.asarray(vec).flatten()


In [7]:
sequencer = Sequencer(all_words = [token for seq in x_tokenized for token in seq],
              max_words = 1200,
              seq_len = 14,
              embedding_matrix = model.wv
             )

In [9]:
import numpy as np
test_vec = sequencer.textToVector("i am in love with you hihi")
test_vec

array([0.16200484, 0.18967248, 0.1953795 , ..., 0.        , 0.        ,
       0.        ])

In [10]:
test_vec.shape


(1400,)

In [11]:
# But before creating a PCA model using scikit-learn let's create
# vectors for our each vector
x_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in x_tokenized])
print(x_vecs.shape)

(5728, 1400)


In [15]:
from sklearn.decomposition import PCA
pca_model = PCA(n_components=450)
pca_model.fit(x_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.9758031316375915


In [16]:
x_comps = pca_model.transform(x_vecs)
x_comps.shape

(5728, 450)

In [18]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_comps,data0['spam'],test_size=0.2,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(4582, 450)
(1146, 450)
(4582,)
(1146,)


In [19]:
from sklearn.preprocessing import MinMaxScaler 
scaler = MinMaxScaler()
# print(scaler.fit(x_test_transformed))
print(scaler.fit(np.concatenate((x_train,x_test))))

print(scaler.data_max_)
print(scaler.transform(x_train))
print(scaler.transform(x_test))
X_train_W2vec=scaler.transform(x_train)
X_test_W2vec=scaler.transform(x_test)

MinMaxScaler()
[27.51257224 14.8011383  15.8455907  13.86202805 14.29259422 12.58947582
 12.08435727 10.49635175 13.10434788  9.15657223 12.35452698  8.61636303
 10.1328882  10.3399089   9.34028853 10.33782316 10.25686112  9.63999192
  9.20841063  8.60149524 10.31189501  8.55030495 10.04731482  8.09865047
  9.09187612  8.49312012  9.23771801  9.20597312  8.90705404  8.58986641
  8.03627818  7.39375621  7.90903841  6.88133377  7.18894353  7.52427968
  8.22119103  7.84243795  6.91080572  6.92455327  7.26772472  8.50184092
  7.0382852   7.42242725  7.91002399  7.07094703  7.27690322  6.79484932
  7.4267165   8.36492131  7.83773774  6.81361003  7.86975835  6.87551439
  6.32006311  6.22967286  7.23142877  6.92833236  7.27849723  6.51171459
  6.40029019  7.00739842  6.78308599  6.70227399  6.75893365  6.68353769
  7.01058363  5.73304837  6.29668109  5.89291656  5.99304857  6.32318231
  6.71648815  6.28403391  5.95105827  6.79284674  5.88008109  7.32061084
  6.65344075  6.49511834  6.15444397

In [26]:
from sklearn import svm
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
param_grid_SVM = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}


gs_svm = GridSearchCV(SVC(),param_grid_SVM,cv=5) 
gs_svm.fit(X_train_W2vec,y_train)


In [27]:
gs_svm.best_params_

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

In [28]:
gs_svm.best_score_

0.9244901020510208

In [30]:
from sklearn.svm import SVC
#Create a svm Classifier
clf = svm.SVC(C=10, gamma=0.1,kernel='rbf') 

#Train the model using the training sets
clf.fit(X_train_W2vec,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test_W2vec)


In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_pred,y_test)))
print('Precision score: ', format(precision_score(y_pred,y_test)))
print('Recall score: ', format(recall_score(y_pred,y_test)))
print('F1 score: ', format(f1_score(y_pred,y_test))) 

Accuracy score:  0.931064572425829
Precision score:  0.8172413793103448
Recall score:  0.9011406844106464
F1 score:  0.8571428571428571


In [23]:
print(np.amin(X_train_W2vec))
print(np.amax(X_train_W2vec))

0.0
1.0000000000000002


In [24]:
naive_bayes = GaussianNB()
naive_bayes.fit(X_train_W2vec,y_train)
predictions = naive_bayes.predict(X_test_W2vec)
predictions

array([0, 1, 0, ..., 1, 1, 0], dtype=int64)

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(predictions,y_test)))
print('Precision score: ', format(precision_score(predictions,y_test)))
print('Recall score: ', format(recall_score(predictions,y_test)))
print('F1 score: ', format(f1_score(predictions,y_test)))

Accuracy score:  0.6972076788830716
Precision score:  0.6551724137931034
Recall score:  0.43478260869565216
F1 score:  0.5226960110041267


In [34]:
from sklearn.neighbors import KNeighborsClassifier
param_grid = {'n_neighbors': list(range(1,9)),'weights' : ['uniform','distance'],'metric' : ['minkowski','euclidean','manhattan'] }
gs_knn = GridSearchCV(KNeighborsClassifier(),param_grid,cv=5)
gs_knn.fit(X_train_W2vec,y_train)

In [35]:
gs_knn.best_score_

0.732658945774383

In [36]:
gs_knn.best_params_

{'metric': 'minkowski', 'n_neighbors': 2, 'weights': 'uniform'}

In [46]:
knn = KNeighborsClassifier(n_neighbors = 2, weights = 'uniform',metric = 'minkowski')
knn.fit(X_train_W2vec,y_train) 
y_pred_knn = knn.predict(X_test_W2vec) 

In [47]:
print('Accuracy score: ', format(accuracy_score(y_pred_knn,y_test)))
print('Precision score: ', format(precision_score(y_pred_knn,y_test)))
print('Recall score: ', format(recall_score(y_pred_knn,y_test)))
print('F1 score: ', format(f1_score(y_pred_knn,y_test))) 

Accuracy score:  0.7495636998254799
Precision score:  0.7586206896551724
Recall score:  0.5034324942791762
F1 score:  0.6052269601100413


In [48]:
from sklearn.ensemble import RandomForestClassifier
param_grid_RF = {'n_estimators': [25, 50, 100, 150],'max_features': ['sqrt', 'log2', None],'max_depth': [3, 6, 9],'max_leaf_nodes': [3, 6, 9],}
gs_rf = GridSearchCV(RandomForestClassifier(),param_grid_RF,cv=5)
gs_rf.fit(X_train_W2vec,y_train)

In [49]:
gs_rf.best_score_

0.8745084359954856

In [50]:
gs_rf.best_params_

{'max_depth': 6,
 'max_features': None,
 'max_leaf_nodes': 9,
 'n_estimators': 100}

In [51]:
rf = RandomForestClassifier(n_estimators=100,max_depth=6,max_leaf_nodes=9,max_features=None,n_jobs=-1)
rf_model = rf.fit(X_train_W2vec,y_train)
y_pred_rf=rf_model.predict(X_test_W2vec)

In [None]:
print('Accuracy score: ', format(accuracy_score(y_pred_rf,y_test))) 
print('Precision score: ', format(precision_score(y_pred_rf,y_test)))
print('Recall score: ', format(recall_score(y_pred_rf,y_test)))
print('F1 score: ', format(f1_score(y_pred_rf,y_test))) 