In [1]:
import numpy as np
import pymc as pm
import pandas as pd
import nltk
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
# Fetch the NLTK resources the preprocessing cells rely on: the stop-word
# list, the WordNet corpus (used by the lemmatizer) and the tagger model
# (presumably the one nltk.pos_tag loads -- confirm for the installed NLTK).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OboTh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OboTh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\OboTh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Token pattern: a run of "word" characters that are not digits, bounded by
# word boundaries (so letters/underscores only; numbers are never tokens).
pattern = r'\b[^\d\W]+\b'
tokenizer = RegexpTokenizer(pattern)
# English stop words, used to filter tokens in preprocess_document.
english_stop = stopwords.words('english')
# Shared WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()

In [4]:
# Load the dataset metadata table; the path is relative, so the CSV must
# sit in the notebook's working directory.
df = pd.read_csv('voted-kaggle-dataset.csv')
df.head()

Unnamed: 0,Title,Subtitle,Owner,Votes,Versions,Tags,Data Type,Size,License,Views,Download,Kernels,Topics,URL,Description
0,Credit Card Fraud Detection,Anonymized credit card transactions labeled as...,Machine Learning Group - ULB,1241,"Version 2,2016-11-05|Version 1,2016-11-03",crime\nfinance,CSV,144 MB,ODbL,"442,136 views","53,128 downloads","1,782 kernels",26 topics,https://www.kaggle.com/mlg-ulb/creditcardfraud,The datasets contains transactions made by cre...
1,European Soccer Database,"25k+ matches, players & teams attributes for E...",Hugo Mathien,1046,"Version 10,2016-10-24|Version 9,2016-10-24|Ver...",association football\neurope,SQLite,299 MB,ODbL,"396,214 views","46,367 downloads","1,459 kernels",75 topics,https://www.kaggle.com/hugomathien/soccer,The ultimate Soccer database for data analysis...
2,TMDB 5000 Movie Dataset,"Metadata on ~5,000 movies from TMDb",The Movie Database (TMDb),1024,"Version 2,2017-09-28",film,CSV,44 MB,Other,"446,255 views","62,002 downloads","1,394 kernels",46 topics,https://www.kaggle.com/tmdb/tmdb-movie-metadata,Background\nWhat can we say about the success ...
3,Global Terrorism Database,"More than 170,000 terrorist attacks worldwide,...",START Consortium,789,"Version 2,2017-07-19|Version 1,2016-12-08",crime\nterrorism\ninternational relations,CSV,144 MB,Other,"187,877 views","26,309 downloads",608 kernels,11 topics,https://www.kaggle.com/START-UMD/gtd,"Context\nInformation on more than 170,000 Terr..."
4,Bitcoin Historical Data,Bitcoin data at 1-min intervals from select ex...,Zielak,618,"Version 11,2018-01-11|Version 10,2017-11-17|Ve...",history\nfinance,CSV,119 MB,CC4,"146,734 views","16,868 downloads",68 kernels,13 topics,https://www.kaggle.com/mczielinski/bitcoin-his...,Context\nBitcoin is the longest running and mo...


In [5]:
# Peek at the free-text column that the topic model is built from.
df['Description'].head()

0    The datasets contains transactions made by cre...
1    The ultimate Soccer database for data analysis...
2    Background\nWhat can we say about the success ...
3    Context\nInformation on more than 170,000 Terr...
4    Context\nBitcoin is the longest running and mo...
Name: Description, dtype: object

In [6]:
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    The tag is matched on its leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [7]:
# NOTE(review): `documents` is not referenced in the visible cells; kept in
# case a later cell relies on it.
documents = []
def preprocess_document(doc):
    """Turn one raw description into a list of lemmatized tokens.

    Steps: strip non-ASCII characters, lowercase, tokenize with the
    module-level ``tokenizer``, drop English stop words, POS-tag the
    remaining tokens, lemmatize each token with its WordNet POS and discard
    single-character lemmas.

    Parameters
    ----------
    doc : str
        Raw description text. Non-text values (for example the NaN that
        pandas stores for a missing description) produce an empty list.

    Returns
    -------
    list
        Lemmatized tokens in document order; ``[]`` when ``doc`` is not text.
    """
    try:
        ascii_only = re.sub(r'[^\x00-\x7F]+', '', doc)
    except TypeError:
        # re.sub raises TypeError for non-string input. Catch only that so
        # genuine bugs are no longer hidden by a bare ``except``.
        return []
    lowered = ascii_only.lower()

    # Set membership is O(1); the stop-word list is scanned linearly otherwise.
    stop_words = set(english_stop)
    kept_tokens = []
    for token in tokenizer.tokenize(lowered):
        if token in stop_words:
            continue
        # Only byte strings need decoding; text strings have no .decode()
        # on Python 3, where the unconditional call would raise.
        if isinstance(token, bytes):
            token = token.decode('utf-8')
        kept_tokens.append(token)

    # pos_tag yields (word, treebank_tag) pairs in input order.
    tagged = nltk.pos_tag(kept_tokens)
    lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]

    return [lemma for lemma in lemmas if len(lemma) != 1]

# Apply per value of the column instead of row-wise over the whole frame.
df['Preprocessed Description'] = df['Description'].apply(preprocess_document)

In [8]:
# Drop every row whose preprocessed token list is empty.
token_counts = df['Preprocessed Description'].map(len)
df = df[token_counts > 0]

In [9]:
# Inspect the token lists produced by preprocess_document.
df['Preprocessed Description'].head(10)

0    [datasets, contain, transaction, make, credit,...
1    [ultimate, soccer, database, data, analysis, m...
2    [background, say, success, movie, release, cer...
3    [context, information, terrorist, attack, glob...
4    [context, bitcoin, long, run, well, know, cryp...
5    [context, first, time, kaggle, conduct, indust...
6    [iris, dataset, use, fisher, classic, paper, u...
7    [world, development, indicator, world, bank, c...
8    [actually, prepare, dataset, student, deep, le...
9    [data, set, include, pokemon, include, number,...
Name: Preprocessed Description, dtype: object

In [10]:
# Raw text of the row labelled 0, for comparison with its preprocessed form.
df['Description'][0]

"The datasets contains transactions made by credit cards in September 2013 by european cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.\nIt contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, ... V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-senstive learning. Feature 'Class' is the response variable and it takes value 1 in case o

In [11]:
# The same row after preprocessing, joined back into one string for reading.
' '.join(df['Preprocessed Description'][0])

u'datasets contain transaction make credit card september european cardholder dataset present transaction occur two day frauds transaction dataset highly unbalanced positive class fraud account transaction contain numerical input variable result pca transformation unfortunately due confidentiality issue cannot provide original feature background information data feature principal component obtain pca feature transform pca time amount feature time contain second elapsed transaction first transaction dataset feature amount transaction amount feature use example dependant cost senstive learning feature class response variable take value case fraud otherwise give class imbalance ratio recommend measure accuracy use area precision recall curve auprc confusion matrix accuracy meaningful unbalanced classification dataset collect analysed research collaboration worldline machine learn group http mlg ulb ac ulb universit libre de bruxelles big data mining fraud detection detail current past pro

In [20]:
# Keep only the first 50 documents.
df = df.head(50)

In [21]:
def get_word_dictionary(documents=None):
    """Map every distinct token to a consecutive integer id.

    Ids are assigned in order of first appearance, starting at 0.

    Parameters
    ----------
    documents : iterable of token lists, optional
        The tokenized documents. Defaults to the module-level
        ``df['Preprocessed Description']`` column, which keeps the original
        zero-argument call working.

    Returns
    -------
    dict
        ``{token: id}`` with ids ``0 .. n_distinct_tokens - 1``.
    """
    if documents is None:
        documents = df['Preprocessed Description']

    word_dictionary = {}
    # Iterate over the values themselves rather than looking rows up by the
    # labels 0..n-1, which raises KeyError when the index has gaps.
    for document in documents:
        for word in document:
            if word not in word_dictionary:
                # The next free id always equals the current dictionary size.
                word_dictionary[word] = len(word_dictionary)
    return word_dictionary

In [22]:
# Renumber the rows 0..n-1 so positional lookups by label work after the
# earlier filtering. Reindexing to a full range and forward-filling would
# instead fill each gap with a copy of the preceding document and inflate
# the corpus with duplicates.
df = df.reset_index(drop=True)
word_dict = get_word_dictionary()

In [23]:
def place_document_topics(word_dict, documents=None):
    """Encode each document as the list of integer ids of its tokens.

    Parameters
    ----------
    word_dict : dict
        Token -> integer id mapping (see ``get_word_dictionary``).
    documents : iterable of token lists, optional
        The tokenized documents. Defaults to the module-level
        ``df['Preprocessed Description']`` column, which keeps the original
        one-argument call working.

    Returns
    -------
    numpy.ndarray
        1-D object array with one entry per document; each entry is a list
        of word ids in token order.

    Raises
    ------
    KeyError
        If a token is missing from ``word_dict``.
    """
    if documents is None:
        documents = df['Preprocessed Description']

    # Iterate over the values directly so a non-contiguous index cannot
    # cause a KeyError.
    doc_ids = [[word_dict[word] for word in document] for document in documents]

    # Documents differ in length. Newer NumPy refuses to build an array from
    # ragged nested lists implicitly, so fill an object array explicitly.
    encoded = np.empty(len(doc_ids), dtype=object)
    for position, word_ids in enumerate(doc_ids):
        encoded[position] = word_ids
    return encoded

In [24]:
# Per-document lists of word ids. Despite the name, these are the observed
# words (used as the values of the `w` nodes), not topic assignments.
doc_topics = place_document_topics(word_dict)

In [41]:
# Number of topics (one phi distribution is built per value in range(k)).
k = 10
# Vocabulary size: number of distinct tokens in word_dict.
v = len(word_dict)
# Number of documents.
d = len(doc_topics)
# Dirichlet parameter for the per-document theta nodes (all ones, length k).
alpha = np.ones(k)
# Dirichlet parameter for the per-topic phi nodes (all ones, length v).
beta = np.ones(v)
# Token count of every document; sizes the per-document z nodes.
doc_lengths = df['Preprocessed Description'].apply(len).tolist()

In [42]:
def build_dirichlet(name, theta, idx):
    """Create a Dirichlet node and return its CompletedDirichlet wrapper.

    The Dirichlet is named ``name + str(idx)`` and parameterised by
    ``theta``; the returned wrapper is named ``'p' + name + str(idx)``.
    """
    label = name + str(idx)
    return pm.CompletedDirichlet('p' + label, pm.Dirichlet(label, theta=theta))

def build_categorical(**kwargs):
    """Create a ``pm.Categorical`` node, forwarding every keyword argument."""
    return pm.Categorical(**kwargs)

def add_to_container(components):
    """Wrap ``components`` in a ``pm.Container`` and return it."""
    return pm.Container(components)

def build_container_components(*args):
    """Return a list holding one container per positional argument.

    Each argument is wrapped via ``add_to_container``; argument order is
    preserved.
    """
    return [add_to_container(group) for group in args]

def build_lda_model():
    """Assemble the LDA graph from module-level settings and return a sampler.

    Reads the globals ``d``, ``k``, ``alpha``, ``beta``, ``doc_lengths`` and
    ``doc_topics``. Builds:

    * ``theta``: one completed Dirichlet per document, parameter ``alpha``.
    * ``phi``: one completed Dirichlet per topic, parameter ``beta``.
    * ``z``: one Categorical per document with ``p=theta[idx]`` and one
      entry per token, initialised with random integers in ``[0, k)``.
    * ``w``: one observed Categorical per token, whose probability vector is
      ``phi`` indexed by that token's ``z`` entry and whose value is the
      token's word id.

    Returns
    -------
    pm.MCMC
        Sampler wrapping a ``pm.Model`` of all the nodes above.
    """
    # Per-document topic proportions.
    theta = [build_dirichlet('theta', alpha, idx) for idx in range(d)]
    # Per-topic word distributions.
    phi = [build_dirichlet('phi', beta, idx) for idx in range(k)]
    # Topic assignment of every token. The initial values come from
    # np.random and no seed is set in the visible cells, so runs differ.
    z = [build_categorical(name='z'+str(idx), p=theta[idx], size=doc_lengths[idx], value=np.random.randint(k, size=doc_lengths[idx])) for idx in range(d)]
    # One observed node per token. The lambda's default arguments bind this
    # token's z entry and phi when the lambda is defined, so each node keeps
    # its own z rather than the comprehension's last one.
    w = [build_categorical(name='w'+str(x)+'.'+str(y), 
           p=pm.Lambda('pz'+str(x)+'.'+str(y), lambda z=z[x][y], phi=phi:phi[z]), value=doc_topics[x][y], observed=True)
           for x in range(d) for y in range(doc_lengths[x])]
    
    # NOTE(review): the two commented lines below are an unused alternative
    # that wraps each group via build_container_components.
#     comps = build_container_components(theta, phi, z, w)
#     model = pm.Model(comps)
    model = pm.Model([theta, phi, z, w])  
    mcmc = pm.MCMC(model)
    return mcmc

In [43]:
# Build the model graph and its MCMC sampler from the settings defined above.
mcmc = build_lda_model()

In [44]:
# Long-running cell: the recorded run below took roughly 1650 seconds.
# iter/burn are passed straight to the sampler (presumably 5000 iterations
# with the first 1000 discarded as burn-in -- confirm against the PyMC docs).
mcmc.sample(iter=5000, burn=1000)

 [-----------------100%-----------------] 5000 of 5000 complete in 1652.5 sec

In [45]:
# Print the last recorded sample of every document's z trace.
for i in range(d):
    last_sample = mcmc.trace('z{}'.format(i))[-1]
    print(last_sample)

[1 8 6 1 5 7 4 4 1 5 4 7 1 5 8 0 0 1 0 6 8 5 6 5 0 8 5 9 4 1 4 0 4 0 2 0 9
 8 7 9 9 1 7 5 0 4 4 6 5 0 4 1 5 7 9 6 4 5 7 4 5 1 8 5 3 2 1 7 5 2 2 4 4 6
 1 1 1 4 0 6 1 9 6 4 1 7 4 1 7 0 6 0 0 0 9 4 1 8 8 0 1 5 7 1 5 5 4 7 1 5 1
 9 8 8 8 1 1 1 8 7 1 7 7 6 7 0 7 3 8 2 4 4 5 0 0 1 8 2 5 4 8 6 0 4 5 1 8 5
 4 0 7 6 0 5 1 6 1 0 4 1 7 6 9 1]
[6 5 4 8 7 6 0 4 5 8 4 6 2 7 3 4 0 0 6 7 3 3 3 4 2 4 6 8 8 6 0 6 8 2 3 5 3
 6 4 0 6 4 3 4 3 9 9 9 0 6 5 2 7 0 0 0 7 6 0 0 6 4 4 6 8 6 8 2 6 3 8 2 6 2
 7 6 4 4 8 7 2 2 8 8 6 7 7 7 7 5 9 9 2 8 4 2 3 3 3 3 4 7 4 2 0 3 8 5 3 8 8
 3 6 3 4 8 2 1 0 6 6 8 9 8 0 7 6 4 8 7 8 4 6 9 4 5 2 4 3 4 2 8 6 6 6 4 0 3
 0 6 3 8 8 4 8 0 5 2 0 5 4 6 3 1 6 3 5 6 7 2 7 6 6 7 6 6 4 6 6 7 3 6 7 2 2
 0 0 2 3 8 8 3 3 4 9 8 3 8 2 8 2 5 6 0 0 6 6 3 6 5 4 6 4 0 3 5 7 6 0 6 6 8
 0 7 8 3 2 6 2 7 0 2 6 1 2 0 9 0 3 3 3 4 4 8 9 1 2 8 4 0 7 0 0 1 0 0 6 0 2
 8 6 3 6 8 2 0 2 0 3 0 4 0 3 2 4 8 3 7 2 4 9 0 5 0 4 8 6 6 6 0 6 2 4 8 0 8
 0 4 6 3 8 3 2 7 8 0 7 2 8 6 6 8 8 6 8 4 5 2 0 3 8 7]
[4 7 4 0 2 0

In [53]:
# Last recorded sample of the z trace for document 0 only.
mcmc.trace('z' + str(0))[-1]

array([1, 8, 6, 1, 5, 7, 4, 4, 1, 5, 4, 7, 1, 5, 8, 0, 0, 1, 0, 6, 8, 5,
       6, 5, 0, 8, 5, 9, 4, 1, 4, 0, 4, 0, 2, 0, 9, 8, 7, 9, 9, 1, 7, 5,
       0, 4, 4, 6, 5, 0, 4, 1, 5, 7, 9, 6, 4, 5, 7, 4, 5, 1, 8, 5, 3, 2,
       1, 7, 5, 2, 2, 4, 4, 6, 1, 1, 1, 4, 0, 6, 1, 9, 6, 4, 1, 7, 4, 1,
       7, 0, 6, 0, 0, 0, 9, 4, 1, 8, 8, 0, 1, 5, 7, 1, 5, 5, 4, 7, 1, 5,
       1, 9, 8, 8, 8, 1, 1, 1, 8, 7, 1, 7, 7, 6, 7, 0, 7, 3, 8, 2, 4, 4,
       5, 0, 0, 1, 8, 2, 5, 4, 8, 6, 0, 4, 5, 1, 8, 5, 4, 0, 7, 6, 0, 5,
       1, 6, 1, 0, 4, 1, 7, 6, 9, 1])