# Classification of text documents

In [521]:
import nltk
import pandas as pd
# %pylab inline
from nltk.corpus import stopwords
import gensim
from gensim import corpora, models, similarities
import re
import random
import time

import sklearn
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, recall_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

### Import OAG data

In [73]:
# load OAG online data
oag_data = pd.read_csv("../data/OAG Complaints-Online_Final.csv")
oag_doc = list(oag_data['COMPLAINT_DESCRIPTION'])
print len(oag_doc)

15034


In [74]:
# remove duplicate
oag_doc = list(set(oag_doc))
print oag_doc[:2]

[nan, 'Note User indicated supporting documents will be mailed within - days They rent cars for Uber to TLC drivers in NYC  There cars are problematic and inoperableYou can see the google reviews or BBB complaints against themThey have our ---- usd deposit and one week of rent --- plus loss of wages for two drivers ie ---- usd']


In [75]:
# remove nan
oag_doc = oag_doc[1:]

print len(oag_doc)

11635


### Import Tweets

In [76]:
# load tweets
twitterdata = pd.read_csv("../data/fraud_list_svarmit_location_v1.csv", header=None)
tweet_doc = list(twitterdata.iloc[:,0])
print len(twitterdata)

7401520


In [96]:
# random select 30000 tweets
selected_tweet_index = list(randint(0, len(tweet_doc),100000))

# filter meaningless tweets
selected_tweet = [tweet_doc[i] for i in selected_tweet_index if type(tweet_doc[i]) == str and len(tweet_doc[i].split()) > 20]

print len(selected_tweet)

20076


In [98]:
# Get random 1000000 unselected tweet
unselected_tweet_index =[]
count = 0
for i in randint(0, len(tweet_doc), 3000000):
    if i not in selected_tweet_index:
        unselected_tweet_index.append(i)
        count += 1
        if count == 100000:
            break
            
unselected_tweet = [tweet_doc[i] for i in unselected_tweet_index if type(tweet_doc[i]) == str and len(tweet_doc[i].split()) > 20]

print len(unselected_tweet)

19818


### Sklearn text feature extraction -- TfidfVectorizer
class sklearn.feature_extraction.text.TfidfVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)[source]¶

In [99]:
# combine corpus
raw_whole_corpus = oag_doc + selected_tweet
print "The length of OAG data (fraud relevent):", len(oag_doc)
print "The length of general tweets (nonfraud relevent):", len(selected_tweet)
print "The length of whole dataset:", len(raw_whole_corpus)

The length of OAG data (fraud relevent): 11635
The length of general tweets (nonfraud relevent): 20076
The length of whole dataset: 31711


In [100]:
# Creat target
target = [1]*len(oag_doc)+[0]*len(selected_tweet)
print target[:10]
print target[-10:]
print len(target)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
31711


In [116]:
raw_whole_corpus[0]

'Note User indicated supporting documents will be mailed within - days They rent cars for Uber to TLC drivers in NYC  There cars are problematic and inoperableYou can see the google reviews or BBB complaints against themThey have our ---- usd deposit and one week of rent --- plus loss of wages for two drivers ie ---- usd'

In [101]:
# Sklearn text feature extraction -- TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=10, stop_words='english')
X = vectorizer.fit_transform(raw_whole_corpus)
X

<31711x7644 sparse matrix of type '<type 'numpy.float64'>'
	with 693412 stored elements in Compressed Sparse Row format>

### Train Classification model with Naive Bayes, SVM linear, Random Forest

In [102]:
# Train test split
X_train, X_test, target_train, target_test = train_test_split(X.toarray(), target, test_size=0.3, random_state=1)

In [103]:
# Naive Bayes Classification
start_time = time.time()
gnb = GaussianNB()
gnb.fit(X_train, target_train)
pred = gnb.predict(X_test)
end_time = time.time()
print("Accuracy by Naive Bayes =", 1.0*sum(target_test==pred)/len(pred))
print "Training it took %.1f mins" %((end_time-start_time)/60.0)

('Accuracy by Naive Bayes =', 0.9612150515030481)
Training it took 0.2 mins


In [108]:
# SVM linear
start_time = time.time()
svc = SVC(kernel='linear')
svc.fit(X_train, target_train)
pred = svc.predict(X_test)
end_time = time.time()
print("Accuracy by SVM with linear =", 1.0*sum(target_test==pred)/len(pred))
print "Training it took %.1f mins" %((end_time-start_time)/60.0)

('Accuracy by SVM with linear =', 0.9910657977717049)
Training it took 11.9 mins


In [109]:
# Random Forest
start_time = time.time()
clf = RandomForestClassifier(n_jobs=-1, n_estimators=500)
clf = clf.fit(X_train, target_train)
pred = clf.predict(X_test)
end_time = time.time()
print("Accuracy by Random Forest =", 1.0*sum(target_test==pred)/len(pred))
print "Training it took %.1f mins" %((end_time-start_time)/60.0)

('Accuracy by Random Forest =', 0.9850746268656716)
Training it took 7.5 mins


### Test model on unselected Tweets

In [104]:
print len(unselected_tweet)

19818


In [105]:
# Extracting features from the test data using the same vectorizer
X_unselected_tweet = vectorizer.transform(unselected_tweet)
X_unselected_tweet

<19818x7644 sparse matrix of type '<type 'numpy.float64'>'
	with 200690 stored elements in Compressed Sparse Row format>

In [110]:
# Apply SVM model on unselected Tweets
svc_pred = svc.predict(X_unselected_tweet.toarray())

In [111]:
pred_fraud_tweet_index = [i for i,j in enumerate(svc_pred) if j ==1]
len(pred_fraud_tweet_index)

77

In [115]:
for i in pred_fraud_tweet_index[:20]:
    print unselected_tweet[i]
    print '----------------------'

Before draining their bank accounts dry. Or worse commit crimes and pin it on the victim. I keep getting phone calls from one scammer 2/
----------------------
@mb @ElaineChase did you not see the two Idyllic Tutors on the book? Also its a very artifact-or-enchantment looking book, not spell-looking
----------------------
I would probably have at the very least another extra  $1000 in my bank account if I stopped taking cabs everywhere
----------------------
@Pat_The_Phantom I work Operating Room specifically pre op and recovery so it happens a lot. Children especially. I sing with them most days
----------------------
Consumer: Media tells me you're good. Here's money.

Obviously [insert artist] is the greatest of all time. Look at the sales.
----------------------
in honor of being called Fara at our track meet at DR I will be going by Fara for the rest of the school year.
----------------------
After throwing my outdoor plants behind a shed, my landlord moved the plants INSIDE my ho

In [113]:
# Apply random forest model on unselected Tweets
clf_pred = clf.predict(X_unselected_tweet.toarray())

clf_pred_fraud_tweet_index = [i for i,j in enumerate(clf_pred) if j ==1]
print len(clf_pred_fraud_tweet_index)

268


In [114]:
for i in clf_pred_fraud_tweet_index[:20]:
    print unselected_tweet[i]
    print '----------------------'

He's not striking out much. Getting on base. Making contact. Running bases well and even the oldest and lamest fan can't knock his hustle.
----------------------
Never met @DaveNavarro but I was once behind Perry Farrell at an Am/Pm near Sunset Strip at 3am. That was interesting.
----------------------
Before draining their bank accounts dry. Or worse commit crimes and pin it on the victim. I keep getting phone calls from one scammer 2/
----------------------
@CopsInTraining Do Notts response PC 's keep the crimes they report? Or  are they handed over for secondary investigation? Good luck in PPD!
----------------------
When a hot foreign exchange student asks Jordan on a coffee date and I can't even get my fuckin dog to hug me for .02 seconds
----------------------
I tweet about junk food so much that all of the promoted tweets on my feed are for cookies, candy or crackers, and nothing else 
----------------------
you're investigating suspicious activity from "Emergency" In other folk

In [106]:
# Apply Naive Bayes model on unselected Tweets
gnb_pred = gnb.predict(X_unselected_tweet.toarray())

gnb_pred_fraud_tweet_index = [i for i,j in enumerate(gnb_pred) if j ==1]
print len(gnb_pred_fraud_tweet_index)

1034


In [107]:
for i in clf_pred_fraud_tweet_index[:20]:
    print unselected_tweet[i]
    print '----------------------'

@BernieSanders I may have been raped at the Murfreesboro, TN VA hospital at the request of the corrupt Rutherford County Sheriff's Office.
----------------------
RT @CloydRivers: What pays for Free Stuff? Taxes.
Who pays taxes? You do.
Therefore, you pay for all Bernies Free Stuff. Merica. https
----------------------
.@Uber @lyft the people of Austin have spoken and you won't respect them. You'll have better luck buying off the worms in the #txlege
----------------------
RT @R_A_Ziemkiewicz: No pkn ze miechu Pacia od Petru labidzi e audyt zodziejstw POPSL zaszkodzi nam zagranico A donosy na rzekomy aut
----------------------
RT @Igy__: Nema kod nas u novinama, ima u Guardian
Serbs rally against shady demolitions after masked crew 'tied up witnesses'
https://t.co
----------------------
RT @RZIMhq: "We refuse to believe that the bank of justice is bankrupt." -Dr. King 
// A reflection in #SliceOfInfinity: https://t.co/sC76g
----------------------
RT @michaelianblack: Donald Trump: has n

### Try more Category on training data

In [264]:
labeled_data = pd.read_pickle('../data/OAG_corpus_with_CUSP_code.pkl')
len(labeled_data)

15000

In [265]:
labeled_data = labeled_data[labeled_data['CUSP_NAAG']!='None']
labeled_data = labeled_data.dropna(subset=['COMPLAINT_DESCRIPTION'], how='all')
labeled_data = labeled_data.drop_duplicates(subset=['COMPLAINT_DESCRIPTION'])
print len(labeled_data)

10054


In [282]:
labeled_data['cusp_code'] = labeled_data['cusp_code'].replace(0,1)

In [443]:
Index_table = labeled_data[['cusp_code','CUSP_NAAG','NAAG_DESCRIPTION']].groupby(['cusp_code','CUSP_NAAG']).count().rename(columns={'NAAG_DESCRIPTION': 'Count'})
Index_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Count
cusp_code,CUSP_NAAG,Unnamed: 2_level_1
1,Residential,873
2,scams,1291
3,misc,2152
4,financial,1552
5,good_n_services,1379
6,government,300
7,utilities,922
8,travel,1585


In [563]:
description_table = labeled_data[['CUSP_NAAG','NAAG_DESCRIPTION']].groupby(['CUSP_NAAG']).sum()
for i in description_table.index.values:
    print i,":"
    print list(set(description_table.loc[i,:][0].split(":")))
    print "-----"

Residential :
[' New Home SalesServices', ' Real Estate Brokers/ApartmentHome Repair/Construction', ' OtherServices', ' Home Improvement/Repair/ContractorsHome Repair/Construction', ' HarassmentLandlord/Tenant', ' Real Estate Brokers/ApartmentLandlord/Tenant', ' Rent Security Deposits or Security InterestCampgrounds/Time SharesHome Repair/Construction', ' Real Estate Brokers/ApartmentCampgrounds/Time SharesHome Repair/Construction', ' Mobile (manufactured) Home Sales, Warranties and ParksHome Repair/Construction', ' Home Improvement/Repair/ContractorsInsurance', ' Rent Security Deposits or Security InterestInsurance', ' Homeowner/Rental InsuranceLandlord/Tenant', ' Rent Security Deposits or Security InterestLandlord/Tenant', ' Homeowner/Rental InsuranceCampgrounds/Time SharesHome Repair/Construction', ' HarassmentHome Repair/Construction', ' Real Estate Brokers/ApartmentInsurance', ' OtherCampgrounds/Time SharesHome Repair/Construction', 'Home Repair/Construction', ' Rent Security Depo

In [267]:
labeled_oag_doc = list(labeled_data['COMPLAINT_DESCRIPTION'])

In [279]:
# combine corpus
raw_whole_corpus_labled = labeled_oag_doc + selected_tweet
print "The length of OAG data (fraud relevent):", len(labeled_oag_doc)
print "The length of general tweets (nonfraud relevent):", len(selected_tweet)
print "The length of whole dataset:", len(raw_whole_corpus_labled)

The length of OAG data (fraud relevent): 10054
The length of general tweets (nonfraud relevent): 20076
The length of whole dataset: 30130


In [284]:
# Creat target, label general tweets as one
target = list(labeled_data['cusp_code'])+[0]*len(selected_tweet)
print target[:10]
print target[-10:]
print len(target)

[2, 4, 8, 2, 5, 7, 1, 4, 3, 8]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
30130


In [288]:
# Sklearn text feature extraction -- TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=10, stop_words='english')
X = vectorizer.fit_transform(raw_whole_corpus_labled)
X

<30130x7264 sparse matrix of type '<type 'numpy.float64'>'
	with 623712 stored elements in Compressed Sparse Row format>

In [289]:
# Train test splitd
X_train, X_test, target_train, target_test = train_test_split(X.toarray(), target, test_size=0.3, random_state=1)

In [291]:
# Naive Bayes Classification
start_time = time.time()
gnb_9cate = GaussianNB()
gnb_9cate.fit(X_train, target_train)
pred = gnb_9cate.predict(X_test)
end_time = time.time()
print("Accuracy by Naive Bayes =", 1.0*sum(target_test==pred)/len(pred))
print "Training it took %.1f mins" %((end_time-start_time)/60.0)

('Accuracy by Naive Bayes =', 0.7112512446067043)
Training it took 0.3 mins


In [292]:
# SVM linear
start_time = time.time()
svc = SVC(kernel='linear')
svc.fit(X_train, target_train)
pred = svc.predict(X_test)
end_time = time.time()
print("Accuracy by SVM with linear =", 1.0*sum(target_test==pred)/len(pred))
print "Training it took %.1f mins" %((end_time-start_time)/60.0)

('Accuracy by SVM with linear =', 0.8621528930191393)
Training it took 33.5 mins


In [293]:
# Random Forest
start_time = time.time()
clf = RandomForestClassifier(n_jobs=-1, n_estimators=500)
clf = clf.fit(X_train, target_train)
pred = clf.predict(X_test)
end_time = time.time()
print("Accuracy by Random Forest =", 1.0*sum(target_test==pred)/len(pred))
print "Training it took %.1f mins" %((end_time-start_time)/60.0)

('Accuracy by Random Forest =', 0.8232105321385109)
Training it took 8.5 mins


In [295]:
# test on unselected tweet
# Extracting features from the test data using the same vectorizer
X_unselected_tweet = vectorizer.transform(unselected_tweet)
X_unselected_tweet

<19818x7264 sparse matrix of type '<type 'numpy.float64'>'
	with 199830 stored elements in Compressed Sparse Row format>

In [296]:
# Apply SVM model on unselected Tweets
svc_pred = svc.predict(X_unselected_tweet.toarray())

2

In [444]:
Index_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Count
cusp_code,CUSP_NAAG,Unnamed: 2_level_1
1,Residential,873
2,scams,1291
3,misc,2152
4,financial,1552
5,good_n_services,1379
6,government,300
7,utilities,922
8,travel,1585


In [445]:
Index_list = Index_table.index.values
Index_list = [i[1] for i in Index_list]
Index_list = ['Irrelevant']+Index_list
print Index_list

['Irrelevant', 'Residential', 'scams', 'misc', 'financial', 'good_n_services', 'government', 'utilities', 'travel']


In [450]:
pred_fraud_tweet_index = []
for a in range(8):
    pred_fraud = [i for i,j in enumerate(svc_pred) if j==a]
    print "The amount of %s tweet: %i" % (Index_list[a], len(pred_fraud))
    pred_fraud_tweet_index.append(pred_fraud)

The amount of Irrelevant tweet: 19788
The amount of Residential tweet: 2
The amount of scams tweet: 4
The amount of misc tweet: 10
The amount of financial tweet: 3
The amount of good_n_services tweet: 3
The amount of government tweet: 0
The amount of utilities tweet: 4


In [456]:
for i,j in enumerate(pred_fraud_tweet_index):
    if i != 0:
        print "Recognized as %s tweet:" % (Index_list[i])
        print "\n"
        for k in j:
            print unselected_tweet[k]
            print "------"
    print "\n"
    print "___________________________________________________________________________________________________________"



___________________________________________________________________________________________________________
Recognized as Residential tweet:


when you live on a hill, in the middle of nowhere, when it rains, your backroads flood and your driveway washes out 
------
@DanConifer @abcnews Heard a lot about N/G 4 years. "why should I pay off my property. Someone else's rent &amp; taxes can pay 4 it"
------


___________________________________________________________________________________________________________
Recognized as scams tweet:


@m_omart Yes, when you paid, there is an email to you, and just reply it because the one is charge of your order.
------
What Do You Get with CSEO? within the first 4 days of your subscription, you'll get a free on-page SEO audit of your website.
------
2/@homeaway: "We can't refund your money even though it was fraud." Why? "Because we sent it to @canadastays". So I call @canadastays
------
@scottmstringer Spoke 2 u @ polling site election day whe

### customize test demo


In [483]:
### customize test demo

test_demo = ["I sent my money to my chase bank, it works really good! I love chase!"]

test_demo_transform = vectorizer.transform(test_demo)
test_result = svc.predict(test_demo_transform.toarray())
print "The test tweet is recognized as %s tweet" % Index_list[test_result[0]]

The test tweet is recognized as Irrelevant tweet


In [528]:
test_demo = ["I sent my money to my chase bank, but it results that I lose my money. I wanna talk with chase bank manager tomorrow"]

test_demo_transform = vectorizer.transform(test_demo)
test_result = svc.predict(test_demo_transform.toarray())
print "The test tweet is recognized as %s tweet" % Index_list[test_result[0]]

The test tweet is recognized as financial tweet
