In [106]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import csv
import re

In [107]:
stopwords = ['MON','TUE','WED','THU','FRI','SAT','SUN',
             'MONDAY','TUESDAY','WEDNESDAY','THURSDAY','FRIDAY','SATURDAY','SUNDAY',
             'JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC',
             'JANUARY','FEBRUARY','MARCH','APRIL','MAY','JUNE','JULY','AUGUST','SEPTEMBER','OCTOBER','NOVEMBER','DECEMBER']
def data():
    #CSV data read
    notams = []
    with open('foreign.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for rows in csv_reader:
            notams.append(rows)
    
    labels = set()
    #data clean
    for i in range(1,len(notams)):
        notams[i][0] = re.sub(r'[^\w\s]','',notams[i][0])
        notams[i][0] = notams[i][0].split('\n')
        lines = ""
        for line in notams[i][0]:
            if "EAIP" not in line:
                lines += line
        notams[i][0] = re.sub(" +"," ",lines)
        for sw in stopwords[:14:-1]:
            notams[i][0] = re.sub(sw," DAYS ",notams[i][0])
        for sw in stopwords[14::-1]:
            notams[i][0] = re.sub(sw," MONTHS ",notams[i][0])
        notams[i][1] = int(eval(notams[i][1]))
        if notams[i][1]>2:
            notams[i][1] = 2
        elif notams[i][1]==2:
            notams[i][1] = 1
        else:
            notams[i][1] = 0
        notams[i][0] = re.sub(' +',' ',notams[i][0])
        labels.add(notams[i][1])
        
    print("AL: LABELS:", labels)
    return notams

In [108]:
notams = data()[1:]
notams[0]

AL: LABELS: {0, 1, 2}


['UTTI TWR OPR HR MONTHS MONTHS THS 05301300 MONTHS THS 05301130 MONTHS THS MONTHS THS CLSD',
 2]

In [109]:
import re
def cleaners(tokens):
    normalized = []
    for token in tokens :
        # remove digits + in range throw x>20 x<2
        if 2<= len(token) <= 20 and not re.search(r'\d', token):
            normalized.append(token)
    return normalized

tagged_notams = [TaggedDocument(words=cleaners(word_tokenize(notam[0].lower())), tags=[str(notam[1])]) for notam in notams]
tagged_notams[-5:]

[TaggedDocument(words=['caution', 'twr', 'lkpr', 'is', 'channel', 'with', 'days', 'arationmake', 'sure', 'about', 'the', 'correct', 'setting', 'on', 'radio', 'panel'], tags=['1']),
 TaggedDocument(words=['due', 'to', 'unreliable', 'aftn', 'feed', 'from', 'pakistankabul', 'area', 'control', 'center', 'will', 'not', 'accept', 'anycoordination', 'information', 'via', 'the', 'aftn', 'system', 'fromthe', 'following', 'firs', 'lahore', 'control', 'karachi', 'controland', 'tehran', 'control', 'only', 'valid', 'method', 'of', 'coordinationwill', 'be', 'via', 'telephone', 'or', 'cellphone', 'communication', 'ufn'], tags=['1']),
 TaggedDocument(words=['in', 'chart', 'standard', 'intrument', 'departure', 'page', 'ad', 'of', 'date', 'days', 'barcelona', 'gral', 'jose', 'antonio', 'anzoategui', 'intlrnav', 'gnss', 'arvex', 'rwy', 'sid', 'osamo', 'transition', 'is', 'suspended'], tags=['0']),
 TaggedDocument(words=['wip', 'wi', 'rectangular', 'areapsn'], tags=['1']),
 TaggedDocument(words=['rnav', '

In [110]:
import sys, random
import multiprocessing

n_notams = len(tagged_notams)

# PARAMS
max_epochs = 50
vec_size = 250
min_count = 2
dm = 1         #0 CBOW / 1 DM
negative = 10
sample = 0     # drop freq threshold
alpha = 0.025

model = Doc2Vec(size=vec_size,
                min_count=min_count,
                dm = dm,
                negative = negative,
                sample = sample,
                workers=multiprocessing.cpu_count())

update_train_epochs = model.iter
model.build_vocab(tagged_notams)

for epoch in range(max_epochs):
    sys.stdout.write('\r' + 'iteration {0}'.format(epoch+1))
    
    random.shuffle(tagged_notams)
    model.train(tagged_notams,
                total_examples=n_notams,
                epochs=update_train_epochs)
    
    # decrease the learning rate
    model.alpha -= 0.002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

d2v_filename = "{epochs}Eps_{up_eps}UpEps_{dim}D_{dm}dm_{mc}mincount_{neg}neg_{sp}sample.d2vmodel".format(epochs=max_epochs,
                                                                                    dim=vec_size,mc=min_count,
                                                                                    up_eps=update_train_epochs,
                                                                                    dm=dm,neg=negative,sp=sample,)
model.save(d2v_filename)
print("\nDoc2vec embeddings Model Saved")



iteration 50
Doc2vec embeddings Model Saved


In [None]:
model= Doc2Vec.load(d2v_filename)


# Manual COSINE SIMILARITY tester ahead 
from scipy import spatial

'''
to find the vector of a document which is not in training data
test = 'PAPI RWY 32 IS RELOCATED AND REVISED GLIDE ANGLE/MEHT AS PER THE MENTIONED DETAILS PAPI RWY 32 LEFT/3.26DEG MEHT/66.71FT PAPI RWY 32 AT DIST OF 387M FM THR OF RWY 32'
test_data = cleaners(word_tokenize(test.lower()))
test_vector= model.infer_vector(test_data)
'''

corr = 0

for notam in notams:
    test_vector = model.infer_vector(cleaners(word_tokenize(notam[0].lower())),steps=20)
    preds = [max(0,1 - spatial.distance.cosine(test_vector, model.docvecs[str(i)])) for i in range(3)]

    if preds.index(max(preds)) == notam[1]:
        corr += 1

print("Manual Cosine Test Acc: ", corr*100/len(notams))

Manual Cosine Test Acc:  31.644049477602977


In [None]:
import numpy as np
model = Doc2Vec.load(d2v_filename)

x_train = []
y_train = []
random.shuffle(notams)

for notam in notams:
    x_train.append(model.infer_vector(cleaners(word_tokenize(notam[0].lower())),steps=100))
    
    labels = "PRIORITY_"+str(notam[1])
    y_train.append(labels)

x_train = np.array(x_train)
y_train = np.array(y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import pickle

print("N SAMPLES: ", len(x_train),len(y_train))

SPLIT_RATIO = 0.9
n_tr = int(n_notams*SPLIT_RATIO)
n_te = n_notams - n_tr

print("SPLITS: Train:{tr} | Test{te}".format(tr=n_tr,te=n_te))

X_train = x_train[:n_tr]*10
Y_train = y_train[:n_tr]
X_test = x_train[-n_tr:]*10
Y_test = y_train[-n_tr:]

logreg_model = LogisticRegression(multi_class='multinomial', solver = 'lbfgs', max_iter = 200)
logreg_model.fit(X_train, Y_train)


train_acc = logreg_model.score(X_train, Y_train)*100
test_acc = logreg_model.score(X_test, Y_test)*100

print("TRAIN ACC: ",train_acc)
print("TEST ACC: ",test_acc)

# Example test item
for i in range(3):
    pick = random.randint(0,n_te)
    print(pick)
    print(logreg_model.predict([X_test[pick]]))
    print(notams[n_te+pick])

y_preds = logreg_model.predict(X_test)
print('Testing accuracy %s' % accuracy_score(Y_test, y_preds))
print('Testing F1 score: {}'.format(f1_score(Y_test, y_preds, average='weighted')))

model_labels = {}
for i in range(len(y_preds)):
    diff = abs(int(y_preds[i][-1]) - int(Y_test[i][-1]))
    try:
        model_labels[diff]
    except:
        model_labels[diff] = 1
    else:
        model_labels[diff] += 1
print(model_labels)

# SAVER
filename = "{train_ac}TRA_{test_ac}TEA.model".format(train_ac=int(train_acc),test_ac=int(test_acc))

pickle.dump(logreg_model, open(filename, 'wb'))
print("Model Saved!")

In [None]:
test = '''PAPI RWY 32 IS RELOCATED AND REVISED GLIDE ANGLE/MEHT AS PER THE MENTIONED DETAILS PAPI RWY 32 LEFT/3.26DEG MEHT/66.71FT PAPI RWY 32 AT DIST OF 387M FM THR OF RWY 32'''
test = re.sub(r'[^\w\s]','',test)
test = " ".join(test.split('\n'))
lines = ""
for line in test:
    if "EAIP" not in line:
        lines += line
test = re.sub(" +"," ",lines)
for sw in stopwords[::-1]:
    test = re.sub(sw," ",test)
test = re.sub(' +',' ',test)
print(cleaners(word_tokenize(test.lower())))
print(logreg_model.predict([model.infer_vector(cleaners(word_tokenize(test.lower())),steps=1000)]))