In [2]:
from pprint import pprint

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt


import prepare

Let's go over a quick example of determining the frequency of words in a document. 
It's a simple concept, but it's important to get a clear understanding of what's going on before moving on to the next topic: TF-IDF.

In [3]:
# Toy single-sentence "document" used for the word-frequency walkthrough.
document = "Mary had a little lamb, a little lamb, a little lamb."

We have a simple doc. Mary had a little lamb

Let's turn that string into a pandas series where each row is a word in our document

In [5]:
# Normalise the text: lower-case it and strip the two punctuation marks that
# occur in it (comma and period).
document = (
    document
    .lower()
    .replace(',', '')
    .replace('.', '')
)

# Split on whitespace so that each row of the Series holds one word.
words = pd.Series(document.split())


In [6]:
# Show the tokenised Series: one row per word, in the order the words appear.
words

0       mary
1        had
2          a
3     little
4       lamb
5          a
6     little
7       lamb
8          a
9     little
10      lamb
dtype: object

Now let's get the Value counts of each of those words and put them into a Pandas DataFrame

In [12]:
# Tally how many times each distinct word occurs; the words become the index
# and the tallies land in a single `raw_count` column.
df = words.value_counts().to_frame(name='raw_count')
df

Unnamed: 0,raw_count
little,3
lamb,3
a,3
had,1
mary,1


Let's figure out the Frequency of these words in our "article"

In [15]:
# frequency:           each word's share of all words in the document.
# augmented_frequency: frequency divided by the largest frequency, so the most
#                      common word(s) score exactly 1.0.
df = (
    df
    .assign(frequency=lambda counts: counts.raw_count / counts.raw_count.sum())
    .assign(augmented_frequency=lambda freqs: freqs.frequency / freqs.frequency.max())
)

In [16]:
# Show the table with the two frequency columns added alongside raw_count.
df

Unnamed: 0,raw_count,frequency,augmented_frequency
little,3,0.272727,1.0
lamb,3,0.272727,1.0
a,3,0.272727,1.0
had,1,0.090909,0.333333
mary,1,0.090909,0.333333


## TF-IDF

In [17]:
# A three-document corpus for the TF-IDF example; each key names one document.
documents = dict(
    news="Codeup announced last thursday that they just launched a new data science program. It is 18 weeks long.",
    description="Codeup's data science program teaches hands on skills using Python and pandas.",
    context="Codeup's data science program was created in response to a percieved lack of data science talent, and growing demand.",
)

In [19]:
# Pass every raw document through prepare.clean (defined in the local
# `prepare` module, not shown here), keeping the same keys so the cleaned and
# raw texts stay aligned.
docs_cleaned = {name: prepare.clean(text) for name, text in documents.items()}

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer created with all-default settings; it is fitted in a later cell.
tfidf = TfidfVectorizer()

In [32]:
# Learn the vocabulary and IDF weights and build the document-term matrix in
# one call. This gives the same result as fit() followed by transform() on the
# same data, and matches how the spam corpus is vectorised later in this
# notebook.
matrix = tfidf.fit_transform(docs_cleaned.values())

In [33]:
# Show the sparse matrix summary (shape and number of stored elements).
matrix

<3x26 sparse matrix of type '<class 'numpy.float64'>'
	with 34 stored elements in Compressed Sparse Row format>

In [35]:
# One row per document, one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. toarray() returns a plain
# ndarray rather than the legacy np.matrix produced by todense().
pd.DataFrame(matrix.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,18,announc,codeup,creat,data,demand,grow,hand,lack,last,...,python,respons,scienc,skill,talent,teach,thursday,use,wa,week
0,0.326245,0.326245,0.192686,0.0,0.192686,0.0,0.0,0.0,0.0,0.326245,...,0.0,0.0,0.192686,0.0,0.0,0.0,0.326245,0.0,0.0,0.326245
1,0.0,0.0,0.217184,0.0,0.217184,0.0,0.0,0.367724,0.0,0.0,...,0.367724,0.0,0.217184,0.367724,0.0,0.367724,0.0,0.367724,0.0,0.0
2,0.0,0.0,0.174252,0.295034,0.348504,0.295034,0.295034,0.0,0.295034,0.0,...,0.0,0.295034,0.348504,0.0,0.295034,0.0,0.0,0.0,0.295034,0.0


And that's a quick run down on getting a matrix of the TF-IDF

Let's use that TF-IDF matrix in a Classification model, to try to predict Spam or Ham

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score



In [108]:
# Load the labelled messages and run each one through prepare.clean (from the
# local `prepare` module, not shown here) so the text is normalised before it
# is vectorised.
df = pd.read_csv('./spam_clean.csv')
df['text'] = df.text.map(prepare.clean)

# Preview goes last: only a cell's final expression is rendered, so a
# df.head() in the middle of the cell is silently discarded.
df.head()

In [109]:
# Fit a fresh vectorizer on the cleaned messages. X is the sparse TF-IDF
# feature matrix and y the matching ham/spam labels.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['text'])
y = df['label']




In [125]:
# Count how many messages fall under each label to check the class balance.
y.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

Our target classes are clearly imbalanced: roughly 87% of the messages are ham and only about 13% are spam. To keep that ratio the same in both the training and the test set, we need to stratify on `y` when we split the data.

In [146]:
# Preview the first 15 rows only. Slicing the sparse matrix *before* converting
# it avoids materialising the whole corpus as a dense array just to display a
# handful of rows.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
pd.DataFrame(X[:15].toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zebra,zed,zero,zhong,zindgi,zoe,zogtoriu,zoom,zouk,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
# Stratify on y so both splits keep the same ham/spam ratio, and pin
# random_state so the split (and everything computed from it) is identical on
# every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=.2, random_state=123
)



In [143]:
# Start one results frame per split, holding the true labels in an `actual`
# column.
train_ys = y_train.to_frame(name='actual')
test_ys = y_test.to_frame(name='actual')


In [145]:
# Show the sparse training matrix summary (shape and stored elements).
X_train

<4457x8062 sparse matrix of type '<class 'numpy.float64'>'
	with 38334 stored elements in Compressed Sparse Row format>

In [113]:

# Fit a logistic regression with default settings on the training features.
lm = LogisticRegression()
lm.fit(X_train, y_train)

# Record the model's predictions next to the actual labels for each split.
train_ys = train_ys.assign(predicted=lm.predict(X_train))
test_ys = test_ys.assign(predicted=lm.predict(X_test))



In [114]:
def predict_spam(string):
    """Classify a single raw message with the fitted model.

    The message is first passed through ``prepare.clean``, the same
    preprocessing applied to the training text before the vectorizer was
    fitted, so that its tokens line up with the learned vocabulary. It is then
    vectorised with the fitted ``tfidf`` and scored by ``lm``.

    Parameters
    ----------
    string : str
        Raw, uncleaned message text.

    Returns
    -------
    The predicted label for the message (the first element of
    ``lm.predict`` for the one-row feature matrix).
    """
    # transform() expects an iterable of documents, hence the one-item list.
    features = tfidf.transform([prepare.clean(string)])
    return lm.predict(features)[0]

In [119]:
# predict_spam requires the message text as its argument; calling it with no
# argument raises TypeError. The message below is an illustrative example.
predict_spam('Congratulations! You have won a free prize. Call now to claim your cash reward.')

'spam'