In [48]:
# Grab the data from here: http://share.mailcharts.com/0D0Q2e0L1s47 and http://share.mailcharts.com/0z0F3m1X0l39
import pandas as pd
import numpy as np
import os
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import ngrams
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans as KM 
from sklearn.cluster import AgglomerativeClustering as AG
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
os.chdir('C:\\Users\\liuth\Documents\\Python Scripts\\ThinkfulProjects\Data')

In [3]:
df = pd.read_csv("capstone-v2.csv", encoding="ISO-8859-1")

df = df.drop_duplicates("email_guid")

print(df.shape)
print(df.columns)

(3346, 8)
Index(['reg_id', 'add_id', 'name', 'email_guid', 'sent_at', 'subject',
       'full_text', 'r'],
      dtype='object')


In [4]:
df.head(5)

Unnamed: 0,reg_id,add_id,name,email_guid,sent_at,subject,full_text,r
0,6361,7526,Le Creuset,45f2d9ed-128e-9ae5-b8f1-4e224a02dfca,2017-01-10 21:34:33,"Welcome, Lorem Ipsum!",LE CREUSET Welcome to Le Creuset. To log in wh...,1
1,6361,7526,Le Creuset,34db5cee-2a9c-17f4-b97d-68343ad26f19,2017-01-10 21:36:48,"Hi! You were looking for free shipping, right?",Save a bundle on shipping with code LECREUSETL...,2
2,6361,7526,Le Creuset,55f96ec8-739f-4a3a-63c4-ec1fddcf795d,2017-01-10 21:41:43,Le Creuset: New Order # 200068673,LE CREUSET Thank you for your order from Le Cr...,3
3,6361,7526,Le Creuset,f0188d30-aa26-8614-7a44-aa149fad66b0,2017-01-12 21:37:00,Your kitchen + our color choices = food heaven,What will you bring to the table? View in brow...,4
4,6361,7526,Le Creuset,29af73ae-8956-9bdc-7148-d5eb0bde173b,2017-01-13 15:09:04,Free Shipping Starts Now + Storage Staples to ...,"Plus, a sweet treat for you! LE CREUSET Cookwa...",5


In [5]:
# Steps: Clean up text, stemming, remove stop words and weird chars, tokenizer words
# punctuation = list(set(string.punctuation))
re_punctuation = "\.|\>|\/|\)|\"|\(|\}|\'|\_|\-|\$|\:|\[|\^|\+|\?|\`|\~|\!|\<|\@|\;|\=|\*|\\\|\{|\&|\]|\||\,|\|"
stopwords_set = list(set(stopwords.words('english')))
handpicked_works = ["com"]

def get_unigram_sentence(sentence, company_name):
    company_names = company_name.lower().split(" ")
    company_names.append(company_name.lower().replace(" ", ""))
    
    sentence_no_punc = re.sub(re_punctuation, " ", sentence)
    unigram = [word for word in word_tokenize(sentence_no_punc.lower()) if word not in stopwords_set and word not in company_names and word not in handpicked_works]
    return unigram

In [6]:
tokenized_text = []

for i, el in df.iterrows():
    tokenized_text.append(get_unigram_sentence(el['subject'], el['name']))

df["tokenized_text"] = tokenized_text

In [7]:
stemmer = PorterStemmer()

def get_stems(words):
    return [stemmer.stem(word) for word in words]

In [8]:
df["stemmed_tokens"] = df.tokenized_text.apply(lambda x: get_stems(x))
df["stemmed_text"] = df["stemmed_tokens"].apply(lambda x: " ".join(word for word in x))

In [9]:
# vectorizer = TfidfVectorizer(stop_words="english", min_df=50, ngram_range=(1,2))
vectorizer = CountVectorizer(stop_words="english", min_df=50)

vectorizer = vectorizer.fit(df["stemmed_text"])
X = vectorizer.transform(df["stemmed_text"])
X = X.toarray()

In [10]:
vectorized_values = X

lda = LDA(6, random_state=22)
lda.fit(vectorized_values)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=6, n_jobs=None,
                          perp_tol=0.1, random_state=22, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [11]:
labels = []

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]])
        labels.append([(vectorizer.get_feature_names()[i]) for i in topic.argsort()[:-1-1:-1]][0])
        print("=" * 100)

print_topics(lda, vectorizer)

Topic 0:
[('order', 602.1915256475438), ('confirm', 236.16604261654658), ('way', 90.16608363707132), ('shipment', 56.16655909372359), ('deliveri', 23.05367572445895), ('receiv', 0.16756098036835607), ('thank', 0.16713000518204474), ('ship', 0.16700231862997877), ('account', 0.16681785094785354), ('10', 0.16680792771428085)]
Topic 1:
[('thank', 194.1656701073239), ('save', 126.16423957595113), ('gift', 93.16551993348605), ('shop', 92.16497189706146), ('15', 0.35531867285716023), ('order', 0.1674975299517836), ('10', 0.16742602419553165), ('purchas', 0.16742071943010997), ('25', 0.16736771114890517), ('holiday', 0.16732600899418787)]
Topic 2:
[('welcom', 303.1656540559297), ('ship', 294.16544958874437), ('free', 168.16424852426377), ('order', 148.14027174193595), ('15', 35.76853519981635), ('10', 0.34035150154350086), ('deliveri', 0.16778540216247048), ('25', 0.16725388319581838), ('end', 0.16715097928250777), ('today', 0.16709417332000442)]
Topic 3:
[('sale', 153.34330470836363), ('50',

In [12]:
# Next, let's assign this to our original text
vectorized_values_lda = lda.transform(vectorized_values)
predicted_label = []
for i in vectorized_values_lda:
    # Get the highest value. We'll consider that to be the predicted label.
    predicted_label.append(labels[i.argsort()[-1]])
df["lda_predicted_label"] = predicted_label

In [13]:
_model = NMF(6, random_state=22)
_model.fit(vectorized_values)

labels = []
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]])
        labels.append([(vectorizer.get_feature_names()[i]) for i in topic.argsort()[:-1-1:-1]][0])
        print("=" * 100)
print_topics(_model, vectorizer)

vectorized_values_model = _model.transform(vectorized_values)

predicted_label = []
for i in vectorized_values_model:
    # Get the highest value. We'll consider that to be the predicted label.
    predicted_label.append(labels[i.argsort()[-1]])

df["nmf_predicted_label"] = predicted_label

Topic 0:
[('order', 5.606824840852225), ('thank', 0.8316709164353175), ('way', 0.4105117487412752), ('ship', 0.3071491839432591), ('receiv', 0.30191834487859615), ('shipment', 0.16984440795643285), ('10', 0.06529432280049217), ('shop', 0.05165021201837265), ('deliveri', 0.03067125141257726), ('15', 0.023221694060237254)]
Topic 1:
[('ship', 3.9381823415025172), ('free', 2.5549554884881016), ('day', 0.3883710977190596), ('10', 0.21359066917282063), ('today', 0.17763187121360932), ('end', 0.14585135710459926), ('50', 0.1428636414713717), ('holiday', 0.13925627594223655), ('25', 0.13825795971340046), ('20', 0.13765613276170413)]
Topic 2:
[('welcom', 4.0755596630223), ('lorem', 0.5978172915638713), ('thank', 0.2949135869388419), ('15', 0.263887056248793), ('gift', 0.20149521065157275), ('10', 0.17781138869136037), ('account', 0.17482972344820127), ('purchas', 0.09113878686110366), ('start', 0.0634533974031104), ('20', 0.06134579442708895)]
Topic 3:
[('new', 4.061018929182856), ('account', 0

In [14]:
_model = KM(6, random_state=22)
_model.fit(vectorized_values)

labels = []
def print_topics(model, vectorizer, top_n=3):
    for i, t in enumerate(_model.cluster_centers_):
        top_words = t.argsort()[:-3:-1]
#         for w in top_words:
#             print(vectorizer.get_feature_names()[w])
        labels.append(vectorizer.get_feature_names()[top_words[0]])
#         print("=" * 100)
print_topics(_model, vectorizer)

vectorized_values_model = _model.transform(vectorized_values)

predicted_label = []
for i in vectorized_values_model:
    # Get the highest value. We'll consider that to be the predicted label.
    predicted_label.append(labels[i.argsort()[-1]])

df["km_predicted_label"] = predicted_label

In [15]:
# Load manual classification
classified = pd.read_csv("captsone-manually-classified.csv")

# Fill blanks with 0
classified.fillna(0, inplace=True)

# Remove any "?" and replace with 0
classified["order confirmation"][classified["order confirmation"] == "?"] = 0

# Print some quick summary stats
print(classified["order confirmation"].value_counts())
print(classified["shipping confirmation"].value_counts())
print(classified["delivery notification"].value_counts())

0    1717
1     206
Name: order confirmation, dtype: int64
0.0    1769
1.0     154
Name: shipping confirmation, dtype: int64
0.0    1906
1.0      17
Name: delivery notification, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [16]:

def get_classification_results(row):
    if row["order confirmation"] != 0:
        return "order"
    if row["shipping confirmation"] != 0:
        return "ship"
    if row["delivery notification"] != 0:
        return "delivery"
    else:
        return "not classified"

classified["manual_label"] = classified.apply(get_classification_results, axis=1)

In [17]:
# Drop duplicates
manual_df = classified.drop_duplicates("email_guid")

In [18]:
df = df.merge(manual_df[["email_guid", "manual_label"]], on="email_guid", how="left")

In [19]:
df["manual_label"].value_counts()

not classified    1539
order              205
ship               151
delivery            17
Name: manual_label, dtype: int64

In [20]:
df.manual_label.fillna("not classified", inplace=True)

In [21]:
# Only focus on emails classified manually

indices = df[df.manual_label != "not classified"].index
y_class = df.loc[indices, "manual_label"]
X_class = X[indices]

In [22]:
lr = LogisticRegression()

In [23]:
lr.fit(X_class, y_class)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
y_predict = lr.predict(X_class)

In [25]:
confusion_matrix(y_class, y_predict)

array([[  0,  12,   5],
       [  0, 205,   0],
       [  0,   9, 142]], dtype=int64)

In [26]:
lr.score(X_class, y_class)

0.9302949061662198

In [27]:
y_class.value_counts()

order       205
ship        151
delivery     17
Name: manual_label, dtype: int64

In [28]:
X_df = pd.DataFrame(X_class)

In [29]:
X_r = df.loc[indices, "r"]
X_r = list(X_r)
X_df["r"] = X_r

In [30]:
lr = LogisticRegression()
lr.fit(X_df, y_class)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
lr.score(X_df, y_class)

0.9436997319034852

In [32]:
y_predict = lr.predict(X_df)

In [33]:
confusion_matrix(y_class, y_predict)

array([[  7,   5,   5],
       [  1, 203,   1],
       [  0,   9, 142]], dtype=int64)

In [34]:
rf = RandomForestClassifier()

In [35]:
rf.fit(X_df, y_class)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [36]:
rf.score(X_df, y_class)

0.9705093833780161

In [37]:
y_predict = rf.predict(X_df)

In [38]:
confusion_matrix(y_class, y_predict)

array([[ 12,   3,   2],
       [  0, 205,   0],
       [  1,   5, 145]], dtype=int64)

In [39]:
# Run against topic modeling
y = df.lda_predicted_label
X_df = pd.DataFrame(X)
X_df["r"] = df.r.values

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.25, random_state=100)

In [41]:
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
lr.score(X_train, y_train)

0.9916301315265046

In [43]:
lr.score(X_test, y_test)

0.984468339307049

In [44]:
y_predict = lr.predict(X_test)

In [45]:
confusion_matrix(y_test, y_predict)

array([[292,   2,   0,   2,   1,   2],
       [  0,  91,   0,   1,   0,   0],
       [  0,   0, 120,   0,   0,   1],
       [  0,   0,   2, 109,   0,   0],
       [  0,   0,   0,   0,  72,   0],
       [  0,   0,   2,   0,   0, 140]], dtype=int64)

In [46]:
#Cross Validate possible

In [49]:
cross_val_score(lr, X_test, y_test, cv=10)



array([0.93181818, 0.95348837, 0.93975904, 0.92771084, 0.89156627,
       0.93975904, 0.92771084, 0.91566265, 0.93975904, 0.98780488])