<a href="https://colab.research.google.com/github/Speedbird45Bravo/rando_projects/blob/main/NLTK_51221.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [311]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from string import punctuation
import gensim
import gensim.downloader as api
import nltk
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
stopwords = nltk.corpus.stopwords.words('english')

In [312]:
text = pd.read_csv("spam.csv", encoding="latin-1")
text = text[['v1','v2']]
text.columns = ['label','text']

In [313]:
def cleaner(text):
    text = "".join([char for char in text if char not in punctuation])
    tokens = re.split("\W+", text.lower())
    text = [word for word in tokens if word not in stopwords]
    return text

In [314]:
tfidf = TfidfVectorizer(analyzer=cleaner)

In [315]:
X = tfidf.fit_transform(text['text'])

In [316]:
X = pd.DataFrame(X.toarray())
y = text[['label']].copy()

In [317]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6)

In [318]:
rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train.values.ravel())

In [319]:
predictions = rf.predict(X_test)

In [320]:
pos = "ham"
precision = precision_score(y_test, predictions, pos_label=pos).round(3)
recall = recall_score(y_test, predictions, pos_label=pos).round(3)

In [321]:
print("Precision: {} | Recall: {}".format(precision, recall))

Precision: 0.968 | Recall: 1.0


In [322]:
wiki_embeddings = api.load('glove-wiki-gigaword-100')

In [323]:
wiki_embeddings.most_similar('maine')

[('vermont', 0.8391201496124268),
 ('connecticut', 0.7976082563400269),
 ('oregon', 0.7730393409729004),
 ('massachusetts', 0.7728150486946106),
 ('missouri', 0.7665733098983765),
 ('virginia', 0.7642097473144531),
 ('rhode', 0.7578915357589722),
 ('carolina', 0.756413996219635),
 ('delaware', 0.7552822828292847),
 ('wisconsin', 0.7548272609710693)]

In [324]:
text['text_clean'] = text['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
X_train, X_test, y_train, y_test = train_test_split(text['text_clean'], text['label'])

In [325]:
w2v = gensim.models.Word2Vec(X_train, size=100, window=5, min_count=2)

In [327]:
w2v.wv.most_similar('house')

[('up', 0.999786913394928),
 ('no', 0.9997838735580444),
 ('msg', 0.9997835159301758),
 ('as', 0.9997766613960266),
 ('any', 0.9997725486755371),
 ('get', 0.9997689723968506),
 ('by', 0.9997686147689819),
 ('here', 0.9997678995132446),
 ('wait', 0.9997670650482178),
 ('its', 0.9997647404670715)]

In [329]:
%%capture 
w2v_vect = np.array([np.array([w2v.wv[i] for i in ls if i in w2v.wv.index2word]) for ls in X_test])

In [330]:
w2v_vect_avg = []
for vect in w2v_vect: 
    if len(vect)!=0:
        w2v_vect_avg.append(vect.mean(axis=0))
    else:
        w2v_vect_avg.append(np.zeros(100))

In [332]:
tagged_docs = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(X_train)]

In [333]:
d2v = gensim.models.Doc2Vec(tagged_docs, vector_size=100, window=5, min_count=2)

In [335]:
d2v.infer_vector(['king','queen'])

array([-2.19391007e-02,  2.73283347e-02, -4.02135076e-03, -1.31881507e-02,
       -2.95545477e-02,  4.33455408e-03, -8.49400647e-03, -2.36688135e-03,
        1.57928478e-03,  1.40861226e-02,  8.60079763e-06,  1.94708128e-02,
       -4.81459545e-03,  4.57884930e-03,  7.86823686e-03, -9.82737169e-03,
        1.33067612e-02, -2.69060265e-02, -4.09757718e-03, -4.93764644e-03,
        1.08793397e-02,  8.98676552e-03,  1.04231425e-02,  6.33892021e-04,
        1.00570275e-02, -1.26737645e-02, -1.15146190e-02, -2.36522127e-02,
       -3.26715559e-02, -2.34243483e-03,  5.28964680e-03, -1.20118661e-02,
       -6.83485018e-03, -3.35601419e-02,  9.03923344e-03, -2.34689545e-02,
        8.46854132e-03,  1.62315574e-02, -1.62493624e-02,  1.54244630e-02,
       -3.93188279e-03, -3.24654356e-02, -1.35524096e-02, -1.87780382e-03,
        2.18140730e-03, -7.50894193e-03,  1.15650902e-02,  9.72089916e-03,
        8.52358062e-03,  8.49572197e-03, -1.19118821e-02, -6.37604250e-03,
        2.47242441e-03, -

In [336]:
vectors = [[d2v.infer_vector(words)] for words in X_test]

In [337]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [338]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [339]:
X_train_sequences_padded = pad_sequences(X_train_sequences, 50)
X_test_sequences_padded = pad_sequences(X_test_sequences, 50)

In [340]:
y_train = y_train.replace("ham",1)
y_train = y_train.replace("spam",0)

In [341]:
y_test = y_test.replace("ham",1)
y_test = y_test.replace("spam",0)

In [342]:
def recall_score(y_true, y_predictions):
  true_positives = K.sum(K.round(K.clip(y_true * y_predictions, 0, 1)))
  possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
  recall = true_positives / (possible_positives + K.epsilon())
  return recall

In [343]:

def precision_score(y_true, y_predictions):
  true_positives = K.sum(K.round(K.clip(y_true * y_predictions, 0, 1)))
  predicted_positives = K.sum(K.round(K.clip(y_predictions, 0, 1)))
  recall = true_positives / (predicted_positives + K.epsilon())
  return recall

In [344]:
model = Sequential()
model.add(Embedding(len(tokenizer.index_word)+1, 32))
model.add(LSTM(32, dropout=0, recurrent_dropout=0))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 32)          211072    
_________________________________________________________________
lstm_7 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_14 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 33        
Total params: 220,481
Trainable params: 220,481
Non-trainable params: 0
_________________________________________________________________


In [345]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', recall_score, precision_score])

In [346]:
history = model.fit(X_train_sequences_padded, y_train, batch_size=32, epochs=10, validation_data=(X_test_sequences_padded, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [347]:
predictions = model.predict(X_test_sequences_padded).round(0)

In [361]:
loss, accuracy, precision, recall = model.evaluate(y_test, predictions)
loss = np.round(loss, 2) * 100
accuracy = np.round(accuracy, 2) * 100
precision = np.round(precision, 2) * 100
recall = np.round(recall, 2) * 100
print("Test Accuracy: {}% | Test Precision: {}% | Test Recall: {}%".format(accuracy, precision, recall))

Test Accuracy: 90.0% | Test Precision: 100.0% | Test Recall: 90.0%
