## Retrieve Classified Data

In [1]:
import pandas as pd
import numpy as np
from data import DataLoader, DataPaths

In [2]:
# Initialize data loader
data_loader = DataLoader()
# Load spam data
spam_data = data_loader.load_data(DataPaths.SPAM_DATA)

[nltk_data] Downloading package punkt to /Users/tumi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/tumi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/tumi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Get processed data
cleaned_text = None
text, is_spam, text_class = data_loader.get_processed_data(concat=True)

In [15]:
text

array(['subject enron methanol meter follow note gave monday preliminary flow data provided daren please override pop daily volume presently zero reflect daily activity obtain gas control change needed asap economics purposes',
       'subject hpl nom january see attached file hplnol xls hplnol xls',
       'subject neon retreat ho ho ho around wonderful time year neon leaders retreat time know time year extremely hectic tough think anything past holidays life go past week december january like think minute calender handed beginning fall semester retreat scheduled weekend january youth ministers conference brad dustin connected week going change date following weekend january comes part need think think agree important us get together time recharge batteries get far spring semester lot trouble difficult us get away without kids etc brad came potential alternative get together weekend let know prefer first option would retreat similar done past several years year could go heartland coun

## Prepare Model Evaluation

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

In [11]:
def evaluate_model(model, X_test, Y_test):
    predictions = model.predict(X_test)
    Y_test_array = np.array(Y_test)
    print("Overall accuracy of model:", sum(predictions == Y_test_array) / len(predictions))
    print("Accuracy on classifying spam:", sum(np.where(Y_test_array != "ham", predictions == Y_test_array, 0)) / sum(Y_test_array != "ham"))
    print("Accuracy on identifying ham from spam:", sum(np.where(Y_test == "ham", predictions == Y_test, 0)) / sum(Y_test == "ham"))

In [12]:
def vectorize_data(ngram_range=(1, 1), variation="count"):
    if variation == "tfidf":
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(text)
    if variation == "count":
        vectorizer = CountVectorizer(ngram_range=ngram_range)
        X = vectorizer.fit_transform(text)
    return X, vectorizer

# Bag of Words (aka CountVectorizer)

In [17]:
vec_X, vectorizer = vectorize_data((1, 1), "count")
# print(vectorizer.get_feature_names_out(vec_X))
# print(vectorizer.vocabulary_)
# print(vec_X.shape)
# print(vec_X)

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(vec_X, text_class, test_size=0.25, random_state=10)

### Applying Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(random_state=0, max_iter=1000)
lr_model.fit(X_train, Y_train)
evaluate_model(lr_model, X_test, Y_test)

Overall accuracy of model: 0.9557323541411669
Accuracy on classifying spam: 0.8021164021164021
Accuracy on identifying ham from spam: 0.9924184988627748


### Applying Naive Bayes

In [13]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train.toarray(), Y_train)
evaluate_model(nb_model, X_test.toarray(), Y_test)

Overall accuracy of model: 0.7084863321093431
Accuracy on classifying spam: 0.6857142857142857
Accuracy on identifying ham from spam: 0.7139246904220369


# TF-IDF (Term Frequency Inverse Document Frequency)

In [20]:
vec_X, vectorizer = vectorize_data((1, 1), "tfidf")
# print(vectorizer.get_feature_names_out(vec_X))
# print(vectorizer.vocabulary_)
# print(vec_X.shape)
# print(vec_X)

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(vec_X, text_class, test_size=0.25, random_state=10)

### Applying Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(random_state=0, max_iter=1000)
lr_model.fit(X_train, Y_train)
evaluate_model(lr_model, X_test, Y_test)

Overall accuracy of model: 0.930640554875561
Accuracy on classifying spam: 0.656084656084656
Accuracy on identifying ham from spam: 0.9962092494313874


### Applying Naive Bayes

In [21]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train.toarray(), Y_train)
evaluate_model(nb_model, X_test.toarray(), Y_test)

Overall accuracy of model: 0.7368421052631579
Accuracy on classifying spam: 0.7132275132275132
Accuracy on identifying ham from spam: 0.7424816780389184


# word2vec

In [84]:
from gensim.models import Word2Vec
from gensim.models import FastText

# w2v_model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
# w2v_model.save("word2vec.model")

In [100]:
model = x = Word2Vec(sentences=cleaned_text, vector_size=120, window=6, 
                               min_count=5, workers=7, epochs=60)
# model = x = FastText(sentences=cleaned_text, vector_size=120, window=6, 
                            #    min_count=5, workers=7, epochs=60)

In [101]:
document_vectors = np.empty((len(cleaned_text), 120))

for i, document in enumerate(cleaned_text):
    word_vectors = []
    for word in document:
        try:
            word_vectors.append(model.wv[word])
        except KeyError:
            print(f"Word '{word}' not in vocabulary.")
    
    if word_vectors:
        document_vectors[i] = np.mean(word_vectors, axis=0)
    else:
        # Handle documents with no known words
        document_vectors[i] = np.zeros(120)

# resulting in an n * 120 -- that is, `Word2Vec:size`-- array
# document_matrix = np.concatenate(documents)

Word 'calender' not in vocabulary.
Word 'ministers' not in vocabulary.
Word 'heartland' not in vocabulary.
Word 'brenham' not in vocabulary.
Word 'brenham' not in vocabulary.
Word 'antique' not in vocabulary.
Word 'brenham' not in vocabulary.
Word 'recharging' not in vocabulary.
Word 'prevail' not in vocabulary.
Word 'complaining' not in vocabulary.
Word 'trending' not in vocabulary.
Word 'abasements' not in vocabulary.
Word 'darer' not in vocabulary.
Word 'prudently' not in vocabulary.
Word 'fortuitous' not in vocabulary.
Word 'lighthearted' not in vocabulary.
Word 'orinoco' not in vocabulary.
Word 'taster' not in vocabulary.
Word 'affluent' not in vocabulary.
Word 'pornographic' not in vocabulary.
Word 'cuvier' not in vocabulary.
Word 'irvin' not in vocabulary.
Word 'parkhouse' not in vocabulary.
Word 'blameworthy' not in vocabulary.
Word 'chlorophyll' not in vocabulary.
Word 'robed' not in vocabulary.
Word 'clears' not in vocabulary.
Word 'bayda' not in vocabulary.
Word 'inconvenien

In [88]:
len(document_vectors)

19607

In [102]:
X_train, X_test, Y_train, Y_test = train_test_split(document_vectors, text_class, test_size=0.25, random_state=10)

### Applying Logistic Regression

In [103]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(random_state=0, max_iter=1000)
lr_model.fit(X_train, Y_train)
evaluate_model(lr_model, X_test, Y_test)

Overall accuracy of model: 0.9077927376580988
Accuracy on classifying spam: 0.6592592592592592
Accuracy on identifying ham from spam: 0.9671468284053576


### Applying Naive Bayes

In [85]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train.toarray(), Y_train)
evaluate_model(nb_model, X_test.toarray(), Y_test)

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'