# Predict Citations with Word to Watch Vectors
Here we try to predict whether a sentence needs a 'citation needed' tag or not based on
1) Word to watch vectors only (english, all languages)
2) Word to watch vectors + main sec indicator


In [45]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.cross_validation import train_test_split
from random import shuffle
import numpy as np
import hashlib

DEPTH_SEARCH=[ 100, 200]
NTREES_SEARCH=[10,30, 50, 100, 200]
TEST_SIZE=0.33
VAL_SIZE=0.1

In [72]:
"""
to use this, you will need: 
1) features from https://drive.google.com/open?id=1JZu67psmj2Eou2-8wQEJk4kAQfg8GDs2, to be placed in ../fastText_multilingual/features
"""

def count_negatives(negatives,positives):
    """
    for balanced data, we need to know how many negatives are out there
    """
    proportion={}
    allneg=0
    for lan in languages:
        proportion[lan]=len(negatives[lan])/float(len(negatives[lan])+len(positives[lan]))
        allneg+=len(negatives[lan])
    print 'proportion of negatives per language'
    print proportion
    return allneg

def get_values_for_crossvalidation(positives,negatives,features):
    """
    positives: list of positives
    negatives: list of negatives
    features: list of feature dictionaries, per type
    """
    values=[]
    y=[]
    ids=[]
    val_values=[]
    val_y=[]
    val_ids=[]
    for lan in languages:
        shuffle(negatives[lan])
        shuffle(positives[lan])
        limit=int(len(positives[lan])*(1-VAL_SIZE))
        print(limit)
        alldata=set(negatives[lan][:limit]+positives[lan][:limit])
        val_alldata=set(negatives[lan][limit:]+positives[lan][limit:])
        ids=ids+list(alldata)
        val_ids=val_ids+list(val_alldata)
        for id in alldata:
            v=[]
            for f in features: #for every type of feature
                if isinstance(f[id], int):
                    v.append(f[id])
                else:
                    for element in f[id]: #append element of feature
                        v.append(element)
            values.append(np.nan_to_num(np.asarray(v)))
            y.append(labels[id])     
        for id in val_alldata:
            v=[]
            for f in features: #for every type of feature
                if isinstance(f[id], int):
                    v.append(f[id])
                else:
                    for element in f[id]: #append element of feature
                        v.append(element)
            val_values.append(np.nan_to_num(np.asarray(v)))
            val_y.append(labels[id])    
    #reshuffle everything for cross_validaton
    ind=range(len(y))
    shuffle(ind)
    y2=[y[i] for i in ind]
    values2=[values[i] for i in ind]
    ids2=[ids[i] for i in ind]
    val_ind=range(len(val_y))
    shuffle(val_ind)
    val_y2=[val_y[i] for i in val_ind]
    val_values2=[val_values[i] for i in val_ind]
    val_ids2=[val_ids[i] for i in val_ind]
    return y2,values2,ids2,val_y2,val_values2,val_ids2

def perform_gridsearch_withRFC(values,y,val_values,val_y):
    """
    values: list of feature vectors
    y: labels
    returns
    max_ind: depth and estimator values
    max_val: crossval prediction accuracy
    scores: all-scores for each combination of depth and nestimators
    """
    scores={}
    #performs cross_validation in all combiantions
    for d in DEPTH_SEARCH:
        for n in NTREES_SEARCH:
            clf = RandomForestClassifier(max_depth=d, n_estimators=n)
            s = cross_val_score(clf, values, y)
            print s
            scores[str(d)+' '+str(n)]=np.mean(s)
    #computes best combination of parameters
    max_ind=''
    max_val=0
    for s in scores:
        if scores[s]>max_val:
            max_val=scores[s]
            max_ind=s
    d=int(max_ind.split(' ')[0])
    n=int(max_ind.split(' ')[1])    
    print max_ind
    print max_val
    clf = RandomForestClassifier(max_depth=d, n_estimators=n)
    clf.fit(values,y)
    prob=clf.predict(val_values)
    print('validation F1:'+str(f1_score(val_y, prob.round())))
    return max_ind,max_val,scores

def train_test_final(val_train,val_test,y_train,d,n):
    """
    just using a Random Forestc classifier on a train/test split for deployment 
    returns model and probability on the test set
    """
    clf = RandomForestClassifier(max_depth=d, n_estimators=n)
    clf.fit(val_train,y_train)
    prob=clf.predict_proba(val_test)
    return clf,prob


In [73]:

def print_top_bottom_sentences(prob,ids_test,y_test,text,labels):
    """
    here we are displaying the 
    """
    pos_proba=(np.asarray(prob).T)[1]
    indexes=np.argsort(-np.asarray(pos_proba))
    for i in indexes[:100]:
        print text[ids_test[i]]
        print y_test[i]
        print labels[ids_test[i]]#checking
    print ('********************************')
    for i in indexes[-40:]:
        print text[ids_test[i]]
        print y_test[i]
        print pos_proba[i]
        print labels[ids_test[i]]#checking

Let's load labels and vectors.. 


Now we load all data for prediction into different variables

In [74]:
languages=['en']
language_extended=['english']

filenames_raw={'en':{'positive':'/home/miriam/Documents/CitationNeeded/uncited-statement-detection/data_final/en_wiki_subset_statements_all_citations_sample.txt', 
            'negative':'/home/miriam/Documents/CitationNeeded/uncited-statement-detection/data_final/en_wiki_subset_statements_no_citations_sample_decomposed.txt'}}

features=['Glove','VERBS','WW']
feature_files={'positive':{}, 'negative':{}}
header=[]
group=[]
for lang in languages:
    for fea in features:
            for p in['positive','negative']:
                feature_files[p][fea] = open(filenames_raw[lang][p][:filenames_raw[lang][p].find('.')]+fea+'.tsv')
                line=feature_files[p][fea].readline()
            header+=[i for i in line[:-1].split('\t')]
            group+=[fea for  i in line[:-1].split('\t')] 
header.append('Main Section')
group.append('main')
print(len(header))

"""
raw header is:
entity_id	revision_id	timestamp entity_title	section	start	offset	statement label
feature header is:
entity_id	revision_id	timestamp entity_title	section	start	offset	 label feature
"""
labels={} #whether it needs a citation or not
vectors={} #the word vectors aligned to english
main={} #is it the main section?
language={} #which language is the article from
pages={} #length of the page
positives={}#statements with citation
negatives={}#statements without citation
text={}#raw text
ww={}
verbs={}
vectors={}
                
for lan in languages:
    positives[lan]=[]
    negatives[lan]=[]
    for p in['positive','negative']:
            f = open(filenames_raw[lan][p], 'r')
            for line in f:
                unique=hashlib.sha224(line).hexdigest() #unique identifier of this line
                row=line[:-1].split('\t')
                if len(row[-2])>10:
                    text[unique] = row[-2] 
                else:
                    continue
                main[unique]= 1 if row[5]=='MAIN_SECTION'else 0
                vectors[unique]=[float(i) for i in feature_files[p]['Glove'].readline()[:-1].split('\t')[1:]]
                ww[unique]=[float(i) for i in feature_files[p]['WW'].readline()[:-1].split('\t')[1:]]
                verbs[unique]=[float(i) for i in feature_files[p]['VERBS'].readline()[:-1].split('\t')[1:]]
                labels[unique]=1 if p=='positive' else 0
                positives[lan].append(unique) if  p=='positive' else negatives[lan].append(unique)



536


In [75]:
allneg=count_negatives(negatives,positives)
print text.values()[:10]

proportion of negatives per language
{'en': 0.5019954533973225}
["Sir Henry Joseph Wood CH (3 March 1869\xc2\xa0\xe2\x80\x93 19 August 1944) was an English conductor best known for his association with London's annual series of promenade concerts, known as the Proms.", 'Clinics see an increase in people seeking testing and treatment for sexually transmitted diseases, and Calgary is said to experience an annual baby boom each April\xc2\xa0\xe2\x80\x93 nine months after the event.', 'O\'Reilly said that Miller and Harvey had counter-attacked with "such joyful abandon that it would have been difficult, if not absolutely impossible, to gather from their methods of going about it that they were actually retrieving a tremendously difficult situation"', 'When viewed through a moderate telescope, the two components\xe2\x80\x94a brighter blue-white star of magnitude 3.9 and a fainter star of magnitude 6.1 that has been described as lilac as well as blue-white\xe2\x80\x94can be seen', 'The exhib

We now select the data for training: all negatives + an equal number of positives, using only feature vectors

In [76]:
y,values,ids,valy,valvalues,valids=get_values_for_crossvalidation(positives,negatives,[ww])
print len(positives['en'])
print len(negatives['en'])
print valy
print valids

8872
9858
9937
[0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 

We now run a grid search to find the good random forest parameter

In [77]:
max_ind,max_val,scores=perform_gridsearch_withRFC(values,y,valvalues,valy)

[ 0.57082488  0.56459249  0.56188705]
[ 0.57183908  0.56594521  0.56053433]
[ 0.56964165  0.56526885  0.56171796]
[ 0.57099391  0.56560703  0.56171796]
[ 0.57099391  0.56577612  0.56188705]
[ 0.57048682  0.56239432  0.56121069]
[ 0.57065585  0.56560703  0.56154887]
[ 0.57048682  0.56526885  0.56290159]
[ 0.57048682  0.56493067  0.56239432]
[ 0.57082488  0.56560703  0.56290159]
200 200
0.566444501761
validation F1:0.312757201646


In [78]:

print ('verbs')
y,values,ids,valy,valvalues,valids=get_values_for_crossvalidation(positives,negatives,[verbs])
max_ind,max_val,scores=perform_gridsearch_withRFC(values,y,valvalues,valy)

verbs
8872
[ 0.69979716  0.700372    0.71068651]
[ 0.70841785  0.70341562  0.7089956 ]
[ 0.70672752  0.70679743  0.71034833]
[ 0.71146045  0.70679743  0.71034833]
[ 0.70875592  0.7089956   0.71322286]
[ 0.71602434  0.7160974   0.71931011]
[ 0.71720757  0.71897193  0.72218465]
[ 0.7214334   0.71846466  0.72319919]
[ 0.7219405   0.72150829  0.72100101]
[ 0.7219405   0.7213392   0.72387555]
200 200
0.722385081671
validation F1:0.681100058514


In [80]:
print ('vectors')
y,values,ids,valy,valvalues,valids=get_values_for_crossvalidation(positives,negatives,[vectors])
max_ind,max_val,scores=perform_gridsearch_withRFC(values,y,valvalues,valy)

vectors
8872
[ 0.70858688  0.70426108  0.69479202]
[ 0.73411089  0.73148461  0.72759554]
[ 0.74256254  0.74112276  0.74010822]
[ 0.74442191  0.74602638  0.7473791 ]
[ 0.75270453  0.75278999  0.75211363]
[ 0.6964165   0.70138654  0.69546838]
[ 0.73529412  0.73977004  0.73283734]
[ 0.74104124  0.74095367  0.73672641]
[ 0.74729547  0.74535002  0.74095367]
[ 0.74881677  0.75735543  0.75042273]
100 200
0.75253604954
validation F1:0.761463163318


In [None]:

print ('ww+verbs')
y,values,ids,valy,valvalues,valids=get_values_for_crossvalidation(positives,negatives,[verbs])
max_ind,max_val,scores=perform_gridsearch_withRFC(values,y,valvalues,valy)

In [83]:

print ('vectors+verbs')
y,values,ids,valy,valvalues,valids=get_values_for_crossvalidation(positives,negatives,[vectors,verbs])
max_ind,max_val,scores=perform_gridsearch_withRFC(values,y,valvalues,valy)

vectors+verbs
8872
[ 0.69776876  0.70054109  0.70679743]
[ 0.73461799  0.73486642  0.73841731]
[ 0.74104124  0.74467366  0.74551911]
[ 0.74290061  0.75126818  0.7473791 ]
[ 0.7525355   0.75481907  0.75448089]
[ 0.70283976  0.69529929  0.69834292]
[ 0.73106829  0.73216097  0.73435915]
[ 0.74459094  0.74196821  0.74163003]
[ 0.74966193  0.74974636  0.74788637]
[ 0.7510142   0.7595536   0.75042273]
100 200
0.75394515438
validation F1:0.738672286617


In [84]:

print ('all')
y,values,ids,valy,valvalues,valids=get_values_for_crossvalidation(positives,negatives,[vectors,verbs,main])
max_ind,max_val,scores=perform_gridsearch_withRFC(values,y,valvalues,valy)

vectors+verbs
8872
[ 0.78701826  0.78559351  0.7926953 ]
[ 0.82775524  0.82871153  0.82668245]
[ 0.83688303  0.82989516  0.83412242]
[ 0.84110886  0.83987149  0.83530605]
[ 0.84465855  0.84392966  0.84308421]
[ 0.79614604  0.80030436  0.78948258]
[ 0.82674104  0.82854244  0.81738248]
[ 0.84330629  0.83074062  0.83429151]
[ 0.84195402  0.84257694  0.83564423]
[ 0.84279919  0.84646601  0.84308421]
200 200
0.844116469486
validation F1:0.85076142132


In [87]:

print ('main')
y,values,ids,valy,valvalues,valids=get_values_for_crossvalidation(positives,negatives,[main])
max_ind,max_val,scores=perform_gridsearch_withRFC(values,y,valvalues,valy)

main
8872
[ 0.77518594  0.78829895  0.7839026 ]
[ 0.77518594  0.78829895  0.7839026 ]
[ 0.77518594  0.78829895  0.7839026 ]
[ 0.77518594  0.78829895  0.7839026 ]
[ 0.77518594  0.78829895  0.7839026 ]
[ 0.77518594  0.78829895  0.7839026 ]
[ 0.77518594  0.78829895  0.7839026 ]
[ 0.77518594  0.78829895  0.7839026 ]
[ 0.77518594  0.78829895  0.7839026 ]
[ 0.77518594  0.78829895  0.7839026 ]
100 30
0.782462497358
validation F1:0.809484193012


We now lok at the effect of adding the 'main' features, i.e. a feature = 1 if the sentence is in the main section

In [32]:
y_m,values_m,ids_m=get_values_for_crossvalidation(positives,negatives,[verbs])

In [33]:
val_train, val_test, y_train, y_test, ids_train, ids_test = train_test_split(values_m, y_m, ids_m, test_size=TEST_SIZE, random_state=42)

In [48]:
dictrain={}

count=1
for i  in ids_train:
    unique=hashlib.sha224(text[i]).hexdigest()
    dictrain[unique]=1

for i  in ids_test:
    unique=hashlib.sha224(text[i]).hexdigest()
    try:
        x=dictrain[unique]
        count+=1
    except:
         dictrain[unique]=1

print ids_train
print(len(values))
    

['544bce7bfebeb99f132415ad19f4f5aa3680c3537089412b0cc1c760', '9f1a29ce1cea85f0a65cebaef0d7e0b66a3768a1aa361fca5ed74f1c', 'da68a3ca53081ca46417bade8673d221e8e74f974cffadd5fe737170', 'a33cb07e0ceacbfef3f102b76a3f5ee10eef468e513b59159bc30ff7', 'e2de77979507668f02624363fccd6f0cb8696a0a93489dad28a6fe51', 'ac33af71055231e7467eccbc38a50d29e20f4a238b06cd7726c9df86', 'fbe5f9bfe439b57441e342143ebcf79152370e6bf4257849168b6279', '0b97b4537894d75227d01aaa9c3a7ee543781f7c25ee1e8095bdc95a', '803d9b4369c7100967fe4a719c4c389ee3ec06311fc4be4bfd7d5c43', 'c5efdfdcf1245e292cd4e72eeb0fd06349e1da2b3b90ba2e122e0a26', '22c6447ac53d40c0d06a490415d29dbe6c14ee5d149bd9eb8d0e7694', 'e7a4c2ef88625b2e938fe0c7ca986b85c14de8ad8b9cce32606ac514', 'aa1dd88ec9bf4fbaa7fe47216b5d52e6fc8155a4dc30819ac714f0e9', '9b6a0b4795c2229d332bbdefefc6a6c910c8c9f21b7d61dc4c2640c1', 'c4272ca9201d2bc04ef96fb1a42c4bef7ed828fea8913f1d5ee3e55d', 'd92292dfd404306f46bc2c95ad1a3ae8713c3d5e8fbe63c5b3a36089', 'f3414457ab93dfd2ad6789ebe4fdc926d62925

In [34]:
clf,prob=train_test_final(val_train,val_test,y_train,200,200)
print_top_bottom_sentences(prob,ids_test,y_test,text,labels)

Punk pinned Cena in a non-title match on the June 13, 2011 episode of Raw, and then became the number one contender by winning a triple threat falls count anywhere match against Alberto Del Rio and Rey Mysterio on the June 20 episode of Raw
1
1
 On 13 October 1898 the appeal was relaunched, with the proposal that if the remaining £3,000 were raised, Watts would design and build a covered way, which in due course would be lined with memorial tablets to commemorate the bravery of ordinary people
1
1
This created the illusion of depth priority, which the computer did not natively support.
0
0
Amidst the best offensive season of his career, McDonald scored a hat trick against Pittsburgh to give him 47 goals at the mid-season break for the 1983 All-Star Game, a total that tied his career high
1
1
The game's poor collision detection was unanimously criticised by reviewers, and Colin Williamson of AllGame also felt that the game's prominent polygon errors were a problem, though he appreciated