In [113]:
import re
import pandas as pd
import numpy as np
import regex as regex
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
from nltk import ngrams
from stemming.porter2 import stem


In [114]:
#import dataset
raw_training_data, raw_testing_data = [],[]

with open("train.txt") as f:
    train=f.readlines()
    ttrain=f.read()
for item in train:
    raw_training_data.append(item.strip())
    
with open("test.txt") as f:
    test=f.readlines()
for item in test:
    raw_testing_data.append(item.strip())

In [115]:
#get an idea of the data
freq=pd.Series(' '.join(train[1]).split()).value_counts()[:10]
freq=list(freq.index)
wordcount = {}
# eliminate duplicates, split by punctuation and use case demiliters
for word in ttrain.lower().split('\t', 1):
    word = word.replace(".","")
    word = word.replace(",","")
    word = word.replace(":","")
    word = word.replace("\"","")
    word = word.replace("!","")
    word = word.replace("â€œ","")
    word = word.replace("â€˜","")
    word = word.replace("*","")
    if word not in wordcount:
        wordcount[word] = 1
    else:
        wordcount[word] += 1
#most common words / features
word_counter = collections.Counter(wordcount)
word_counter

Counter({'': 1})

In [116]:
#clean data

def clean(raw_data):
    
    #split into labels and text
    labels=[lab.split('\t', 1)[0] for lab in raw_data]
    training_data= [item.split('\t', 1)[1] for item in raw_data]
    
    labels,training_data
    
    #convert to lowercase, stem / lemmatize
    training_data = [i.lower() for i in training_data]
    training_data = [" ".join([stem(word) for word in sentence.split(" ")]) for sentence in training_data]
        
    #replace links, email_id, currencies, entities etc
    training_data=[re.sub(r'[\w\.-]+@[\w\.-]+', "$EMAIL_ID", i) for i in training_data]
    training_data=[re.sub(r"(<?)http:\S+", "$URL", i) for i in training_data]
    training_data=[re.sub(r"\$\d+", "$CURR", i) for i in training_data]
    training_data=[re.sub(r'\b\d+\b', "$NUM", i) for i in training_data]
    training_data=[re.sub(r'\b(me|her|him|us|them|you)\b', "$ENTITIES", i) for i in training_data]
    
    
    #remove punctuation, special chars, tokenize data
    training_data = [regex.sub(r"[^\P{P}$]+", " ", i) for i in training_data]
    training_data = [re.sub(r"[^0-9A-Za-z/$' ]", " ", i) for i in training_data]
    
    #regularize data w.r.t days, times, months and year
    regex_match_days= r'monday|tuesday|wednesday|thursday|friday|saturday|sunday'
    regex_match_times= r'morning|afternoon|evening'
    regex_match_events= r'after|before|during'
    regex_match_month= r'january|february|march|april|may|june|july|august|september|october|november|december'
    
    training_data = [re.sub(regex_match_days, "$day", i) for i in training_data]
    training_data = [re.sub(regex_match_times, "$times", i) for i in training_data]
    training_data = [re.sub(regex_match_events, "$events", i) for i in training_data]
    training_data = [re.sub(regex_match_month, "$month", i) for i in training_data]
    
    #remove extra spaces and blanks
    training_data = [item.strip() for item in training_data]
    
    #return cleaned data
    return training_data, labels    
    
    

In [117]:
#n-gram based SVM Classification, since problem statement says its a binary Classification 
def get_phrases(text, n):
    
    """#define for 3 n-gram models : unigram, bigram and trigram.
    #bi_grams,tri_grams = "",""
    #bi_l,tri_l= [], []
    
    #get ngrams
    
    #bi_grams = ngrams(text.split(), 2)
    #tri_grams = ngrams(text.split(), 3)
    
    #build, set ngrams"""
    """for grams in bi_grams:
        bi_l.append('_'.join(map(str,grams)))
    for grams in tri_grams:
        tri_l.append('_'.join(map(str,grams)))    
    bstring= ' '.join(bi_l)
    tstring= ' '.join(tri_l)    
    return bstring,tstring"""
    
    n_grams=ngrams(text.split(),n)
    gram_list = []
    for grams in n_grams:
        gram_list.append('_'.join(map(str,grams)))
    gram_string = ' '.join(gram_list)
    print gram_string
    return gram_string

In [118]:
#the research paper attached makes use of TF-IDF, hence ranking constraints will be applied using TF-IDF
def tf_idf(data):
    
    vector_model= TfidfVectorizer(min_df=1) #single dataframe
    X = vector_model.fit_transform(data) #fit model and transform data to required vector
    
    #only one axis holds the ranking -> X axis
    return X

In [119]:
#get the cleaned data : 
training_data, training_labels = clean(raw_training_data)
testing_data, testing_labels = clean(raw_testing_data)

# since labels arent distributed in a random fashion, distribution is either a single set of 'NO' or 'NO' and 'YES' combined.
# get X,Y training data by mixing both, then using k-folds for distribution or check weight biases using jack-knife resampling
# another approach is to mix both data, then split using the train-test-split module.

X = tf_idf(training_data+testing_data)
Y = training_labels+testing_labels

#set random_state to 42 for reproducible results
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)


In [50]:
#tune parameters for c and gamma, use the rbf kernel
#running the function takes time, so it has already been run once and best C and gammas have been identified
#best C for SVM model - 2100, 14,600, 23,200
def get_c_and_gamma(X,Y,nfolds):
    Cs = [0.0001, 0.001, 0.01, 0.1, 1, 10] #lower Cs value gives a simpler decision function
    gammas = [ 0.001, 0.01, 0.1, 1, 10] #inverse of radius of influence

    from sklearn.grid_search import GridSearchCV
    param_grid= {'C':Cs, 'gamma':gammas}
    grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, Y)
    print grid_search.best_params_
    return grid_search.best_params_

{'C': 1, 'gamma': 1}


In [120]:
#basic SVM model vs bi-gram model vs tri-gram model
#result classification can be seen using a confusion_matrix

#new_C=get_c_and_gamma(X,Y,10)
svm=SVC(C=2100, kernel = 'rbf')
svm.fit(x_train, y_train)
print "SVM classification : \n"
print "Training Accuracy : ",svm.score(x_train, y_train),"\nTesting Accuracy : ",svm.score(x_test, y_test)

SVM classification : 

Training Accuracy :  0.8850645359557467 
Testing Accuracy :  0.7813620071684588


In [121]:
#bi-gram and tri-gram SVM model
total_data = training_data+testing_data
total_labels = training_labels+testing_labels

bi_gram_data = [get_phrases(item,2) for item in total_data]
tri_gram_data = [get_phrases(item,3) for item in total_data]

X2, Y2 = tf_idf(bi_gram_data), total_labels
X3, Y3 = tf_idf(tri_gram_data), total_labels

x_train1, x_test1, y_train1, y_test1 = train_test_split(X2, Y2, test_size=0.3, random_state=42)
x_train2, x_test2, y_train2, y_test2 = train_test_split(X3, Y3, test_size=0.3, random_state=42)
    

$NUM_contact contact_$ENTITIES $ENTITIES_now now_to to_make make_$CURR $CURR_today today_$link
act_now now_to to_keep keep_your your_life life_on on_the the_go
choos_between between_$CURR $CURR_and and_$CURR $CURR_dollar dollar_with with_up up_to to_$NUM $NUM_year year_to to_repay
click_abov abov_to to_earn earn_today
click_here here_to to_receiv receiv_your your_first first_$CURR $CURR_today
click_here here_to to_start start_shop shop_now now_$link
click_here here_to to_watch watch_this this_now now_i i_m m_not not_sure sure_how how_long long_he he_ll ll_leav leav_this this_up up_in in_public
confirm_now now_and and_view view_your your_first first_great great_opportun
copi_and and_past past_this this_link link_into into_your your_browser browser_to to_see see_your your_result result_now
find_out out_how how_to to_add add_clip clip_to to_your your_pressroom pressroom_in in_this this_$NUM $NUM_second second_video
go_here here_now now_for for_direct direct_access
go_here here_now now_for

that_what what_confus confus_$ENTITIES $ENTITIES_becaus becaus_i i_rememb rememb_$ENTITIES $ENTITIES_send send_in in_a a_fax fax_nom
the_cpuc cpuc_get get_out out_of of_the the_way way_and and_stop stop_the the_practic practic_of of_$events $events_the the_fact fact_review review_of of_power power_purchases
the_cpuc cpuc_get get_out out_of of_the the_way way_and and_stop stop_the the_practic practic_of of_$events $events_the the_fact fact_review review_of of_power power_purchases
the_celtic celtic_covet covet_$ENTITIES $ENTITIES_too too_but but_cleveland cleveland_might might_not not_give give_$ENTITIES $ENTITIES_the the_chanc chanc_becaus becaus_of of_the the_cav cav_need need_for for_size size_with with_the the_uncertainti uncertainti_in in_their their_frontcourt
the_celtic celtic_need need_size size_but but_can can_t t_let let_a a_scorer scorer_get get_past past_$ENTITIES $ENTITIES_in in_the the_draft
the_east east_desk desk_is is_have have_a a_meet meet_tomorrow tomorrow_at at_the 

look_good good_to to_$ENTITIES $ENTITIES_i i_think think_$ENTITIES $ENTITIES_ought ought_to to_have have_$ENTITIES $ENTITIES_file file_it it_asap
mark_go go_ahead ahead_and and_set set_up up_the the_meeting
mark_note note_the the_feedback feedback_from from_the the_aa aa_s s_it it_is is_imper imper_that that_we we_get get_a a_handl handl_on on_this
$monthb_we we_can can_have have_lunch lunch_or or_something
meet_other other_member member_in in_columbus columbus_and and_contact contact_$ENTITIES $ENTITIES_now
mike_could could_$ENTITIES $ENTITIES_pleas pleas_forward forward_a a_copi copi_of of_the the_document document_list list_below below_to to_ms ms_cantrell
milli_is is_ok ok_however however_i i_would would_like like_a a_quick quick_read read_of of_the the_document document_onc onc_$ENTITIES $ENTITIES_send send_a a_hard hard_copi copi_up
need_to to_get get_peopl peopl_togeth togeth_for for_this this_event
nit_noi noi_sound sound_good good_to to_$ENTITIES $ENTITIES_tonight tonight_if i

are_there_ani there_ani_issu ani_issu_from issu_from_your from_your_point your_point_of point_of_view
are_we_still we_still_hold still_hold_$day hold_$day_$month $day_$month_$NUM $month_$NUM_for $NUM_for_the for_the_et the_et_associ et_associ_and associ_and_analyst and_analyst_mid analyst_mid_year mid_year_$NUM year_$NUM_prc
ariba_has_hail has_hail_the hail_the_acquisit the_acquisit_as acquisit_as_one as_one_that one_that_give that_give_it give_it_best it_best_of best_of_bre of_bre_auction bre_auction_capability auction_capability_which capability_which_it which_it_would it_would_integr would_integr_into integr_into_it into_it_own it_own_suit own_suit_of suit_of_softwar of_softwar_products
as_i_also i_also_mentioned also_mentioned_the mentioned_the_first the_first_weekend first_weekend_in weekend_in_$month in_$month_is $month_is_when is_when_the when_the_roundtop the_roundtop_antiqu roundtop_antiqu_festiv antiqu_festiv_is
as_martin_and martin_and_i and_i_had i_had_mentioned had_mention

when_$ENTITIES_interview $ENTITIES_interview_for interview_for_a for_a_job a_job_$ENTITIES job_$ENTITIES_will $ENTITIES_will_answer will_answer_the answer_the_same the_same_question same_question_almost question_almost_ever almost_ever_time ever_time_$ENTITIES time_$ENTITIES_discuss $ENTITIES_discuss_the discuss_the_posit the_posit_with posit_with_the with_the_hire the_hire_manager
when_your_friend your_friend_see friend_see_your see_your_note your_note_they note_they_can they_can_pray can_pray_for pray_for_$ENTITIES for_$ENTITIES_and $ENTITIES_and_comment and_comment_to comment_to_give to_give_encouragement
where_do_$ENTITIES do_$ENTITIES_want $ENTITIES_want_to want_to_go
where_is_my is_my_free my_free_lunch free_lunch_and lunch_and_my and_my_new my_new_power new_power_golf power_golf_shirt
whether_read_an read_an_email an_email_that email_that_remind that_remind_$ENTITIES remind_$ENTITIES_to $ENTITIES_to_schedul to_schedul_a schedul_a_famili a_famili_get famili_get_togeth get_togeth_

ina_could_$ENTITIES could_$ENTITIES_pleas $ENTITIES_pleas_get pleas_get_$ENTITIES get_$ENTITIES_access $ENTITIES_access_to access_to_ect to_ect_trade ect_trade_on trade_on_the on_the_o the_o_drive
is_there_a there_a_chanc a_chanc_of chanc_of_set of_set_up set_up_a up_a_call a_call_in call_in_number in_number_for number_for_these for_these_meetings
is_there_an there_an_agenda an_agenda_for agenda_for_the for_the_meet the_meet_on meet_on_the on_the_5th
is_there_one there_one_for one_for_this for_this_week
is_this_case this_case_due case_due_today due_today_or today_or_next or_next_$day
it_seem_to seem_to_$ENTITIES to_$ENTITIES_that $ENTITIES_that_we that_we_should we_should_tri should_tri_again tri_again_to again_to_get to_get_those get_those_on those_on_the on_the_list the_list_that list_that_are that_are_not are_not_alreadi not_alreadi_member alreadi_member_to member_to_join to_join_isda
it_was_great was_great_to great_to_meet to_meet_$ENTITIES meet_$ENTITIES_and $ENTITIES_and_i and_i_

In [107]:
#SVM C value for bi-grams
best_c={}
for increment in range(1000, 15000, 100):
    svm=SVC(C=increment, kernel = 'rbf')
    svm.fit(x_train1, y_train1)
    tr_ac, te_ac = svm.score(x_train1, y_train1), svm.score(x_test1, y_test1)
    best_c[str(increment)]=te_ac
import operator
print " Max C value : ",max(best_c.iteritems(),key=operator.itemgetter(1))[0]

 Max C value :  14600


In [108]:
#SVM C value for tri-grams
best_tri_c={}
for increment in range(10000, 30000, 100):
    svm=SVC(C=increment, kernel = 'rbf')
    svm.fit(x_train1, y_train1)
    tr_ac, te_ac = svm.score(x_train1, y_train1), svm.score(x_test1, y_test1)
    best_tri_c[str(increment)]=te_ac
import operator
print " Max C value trigrams : ",max(best_tri_c.iteritems(),key=operator.itemgetter(1))[0]

 Max C value trigrams :  23200


In [122]:
#bi-gram SVM max value as C = 14600
svm=SVC(C=14600, kernel='rbf')
svm.fit(x_train1, y_train1)
print "SVM Classification for Bi-Grams : "
print "Training accuracy : ",svm.score(x_train1, y_train1)
print "Testing accuracy : ",svm.score(x_test1, y_test1)

#tri-gram SVM max value as C = 23200
svm=SVC(C=23200, kernel='rbf')
svm.fit(x_train2, y_train2)
print "\nSVM Classification for Tri-Grams : "
print "Training accuracy : ",svm.score(x_train2, y_train2)
print "Testing accuracy : ",svm.score(x_test2, y_test2)

SVM Classification for Bi-Grams : 
Training accuracy :  0.9797172710510141
Testing accuracy :  0.7827956989247312

SVM Classification for Tri-Grams : 
Training accuracy :  0.9929317762753535
Testing accuracy :  0.739068100358423
