# Word2Vec about Fraud

In [52]:
import nltk
import pandas as pd
# %pylab inline
from nltk.corpus import stopwords
import gensim
from gensim import corpora, models, similarities
import re
import random
import time

In [53]:
oag_data = pd.read_csv("../data/OAG Complaints-Online_Final.csv")

In [54]:
# oag_data.info()

In [55]:
oag_doc = list(oag_data['COMPLAINT_DESCRIPTION'])

In [56]:
# drop duplicate
oag_doc = list(set(oag_doc))
oag_doc = oag_doc[1:]
len(oag_doc)

11635

In [57]:
oag_doc[0]

'Note User indicated supporting documents will be mailed within - days They rent cars for Uber to TLC drivers in NYC  There cars are problematic and inoperableYou can see the google reviews or BBB complaints against themThey have our ---- usd deposit and one week of rent --- plus loss of wages for two drivers ie ---- usd'

In [58]:
# data preprocessing
texts = []
for doc in oag_doc:
    try:
        re.split('\.|\,|\n| ',doc)
        single_doc = []
        for word in re.split('\.|\,|\n| ',doc):
            if word.lower() not in stopwords.words('english') and 'xx' not in word.lower() and len(word)>3 and '$' not in word and '--' not in word:
                single_doc.append(word.lower())
        texts.append(single_doc)
    except:
        pass
len(texts)

11635

In [59]:
print texts[0]

['note', 'user', 'indicated', 'supporting', 'documents', 'mailed', 'within', 'days', 'rent', 'cars', 'uber', 'drivers', 'cars', 'problematic', 'inoperableyou', 'google', 'reviews', 'complaints', 'themthey', 'deposit', 'week', 'rent', 'plus', 'loss', 'wages', 'drivers']


In [60]:
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

In [61]:
texts = [[token for token in text if frequency[token] > 1] for text in texts]
len(texts)

In [62]:
# Phrases Detection
bigram = gensim.models.Phrases(texts, threshold=50.0)
phrases_texts = bigram[texts]

In [63]:
print phrases_texts[0]

[u'note_user', u'indicated_supporting', u'documents_mailed', u'within_days', u'rent', u'cars', u'uber', u'drivers', u'cars', u'problematic', u'google', u'reviews', u'complaints', u'themthey', u'deposit', u'week', u'rent', u'plus', u'loss', u'wages', u'drivers']


In [64]:
# transform into dictionary
dictionary = corpora.Dictionary(phrases_texts)

In [65]:
# store the dictionary, for future reference
# dictionary.save('phrases_texts_oag.dict')

In [66]:
print (dictionary)

Dictionary(16159 unique tokens: [u'cussed', u'deferment', u'yellow', u'four', u'woods']...)


# Word2Vec
Train a Word2Vec model on 11635 documents with 16159 unique word

In [71]:
# gensim.models.word2vec.Word2Vec(sentences=None, 
# size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, 
# sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, 
# negative=5, cbow_mean=1, hashfxn=<built-in function hash>, iter=5, 
# null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000)
# train word2vec
model = gensim.models.word2vec.Word2Vec(phrases_texts, size=60, sg=1, window=5, workers=2)

In [69]:
# save the model
# model.save('word2vec_model_oag')

In [72]:
model.most_similar(positive=['fraud'], topn=20)

[(u'theft', 0.7708562612533569),
 (u'victim', 0.7343912124633789),
 (u'finacing', 0.7333649396896362),
 (u'schumacher', 0.7304519414901733),
 (u'others', 0.7286326885223389),
 (u'committed', 0.726068377494812),
 (u'scams', 0.7253360748291016),
 (u'herds', 0.7206159830093384),
 (u'thru_insurnce', 0.7029843330383301),
 (u'chruch', 0.7029628753662109),
 (u'ideas', 0.7014973163604736),
 (u'felony', 0.6972740888595581),
 (u'theft_deception', 0.6961012482643127),
 (u'scripts', 0.69463050365448),
 (u'commit', 0.6936841011047363),
 (u'thru', 0.693616509437561),
 (u'daley', 0.6882930994033813),
 (u'committing', 0.6829313039779663),
 (u'herds_fraud', 0.6807495355606079),
 (u'mccarty', 0.6803163290023804)]

In [78]:
model.most_similar(positive=['google'], topn=20)

[(u'search', 0.9254671931266785),
 (u'facebook', 0.8930904865264893),
 (u'social_media', 0.8922886848449707),
 (u'websites', 0.8822185397148132),
 (u'googled', 0.8766440749168396),
 (u'reviews', 0.870497465133667),
 (u'searched', 0.8674001693725586),
 (u'reveals', 0.8508349657058716),
 (u'yelp', 0.8448924422264099),
 (u'blog', 0.8439939618110657),
 (u'google_search', 0.8417780995368958),
 (u'posts', 0.840400218963623),
 (u'sites', 0.8362839221954346),
 (u'logo', 0.8324384689331055),
 (u'similar_stories', 0.8282687664031982),
 (u'links', 0.8256586790084839),
 (u'facebook_page', 0.8250905275344849),
 (u'numerous_complaints', 0.824032187461853),
 (u'negative_reviews', 0.8225080370903015),
 (u'tinder', 0.8215990662574768)]

In [108]:
model.most_similar(positive=["attorney_general"], topn=20)

[(u'attorney_generals', 0.9439666867256165),
 (u'schneiderman', 0.859863817691803),
 (u'consumer_affairs', 0.8544109463691711),
 (u'filing_complaint', 0.8480504751205444),
 (u'consumer_protection', 0.8477827906608582),
 (u'investigate', 0.8434239625930786),
 (u'offices', 0.834975004196167),
 (u'better_business', 0.8293050527572632),
 (u'file_complaint', 0.8245018720626831),
 (u'formal_complaint', 0.82379150390625),
 (u'cfpb', 0.8180173635482788),
 (u'federal_trade', 0.8135943412780762),
 (u'article', 0.8086038827896118),
 (u'respectfully', 0.807357907295227),
 (u'lodge', 0.8036432266235352),
 (u'filed_complaint', 0.8001582622528076),
 (u'bureau', 0.791593074798584),
 (u'counsel', 0.7896642684936523),
 (u'refer', 0.7821816802024841),
 (u'division_consumer', 0.781112551689148)]

In [84]:
company = ['uber','chase','citibank','google','facebook','yelp','apple','instagram','starbucks','amazon','airbnb','sony','dell','tinder','linkedin']
sim = [(i,model.similarity('fraud', i)) for i in company]
sorted(sim, key=lambda item: -item[1])

[('tinder', 0.57085916674941717),
 ('yelp', 0.52957863637917491),
 ('instagram', 0.51811455584390576),
 ('uber', 0.50122224942553362),
 ('starbucks', 0.49795903338558251),
 ('citibank', 0.4879619329971665),
 ('linkedin', 0.48762062185402388),
 ('chase', 0.46189347014889526),
 ('google', 0.45818626054274281),
 ('airbnb', 0.40217002622500914),
 ('facebook', 0.36000646193443331),
 ('sony', 0.34183986599785965),
 ('amazon', 0.33733224012772467),
 ('apple', 0.33393577392658724),
 ('dell', 0.28123713377173432)]