/
createNormalDictionary.py
151 lines (131 loc) · 5.44 KB
/
createNormalDictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# -*- coding: utf-8 -*-
"""
Created March 2018
@author: lisam
This program creates a normal dictionary sorted on parts of speech
(assuming one doesn't already exist saved as 'normalDictionary.txt'.
It uses the brown corpus from nltk to load the words. In our case we just used
the entire corpus in our readability project.
Input: a flag that's 'True' if the pos tagger is spacy and 'None' for wordnet
Output: A list of tuples, first part is the pos abbreviation, i.e. 'NN' for noun,
the second part is a list of the nouns found.
(there are more detailed comments throughout the procedure in 'createCyberDictionary.txt')
"""
import nltk
import json
import string
import os
from os.path import exists
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import brown
spacyFlag = None # None for wordnet
folderName = 'dictionaryTexts'
cwd = os.getcwd()
distractorDictPath = os.path.join(cwd, folderName)
def getDict(flag):
global spacyFlag
spacyFlag = flag # set for spacy dictionary
if (flag):
import spacy
normalPOSlib = [('CC',[]),('CD',[]),('DT',[]),('EX',[]),('FW',[]),('IN',[]),('JJ',[]),('JJR',[]),('JJS',[]),('LS',[]),('MD',[]),('NN',[]),('NNP',[]),('NNPS',[]),('NNS',[]),('PDT',[]),('POS',[]),('PRP',[]),('PRP$',[]),('RB',[]),('RBR',[]),('RBS',[]),('RP',[]),('TO',[]),('UH',[]),('VB',[]),('VBD',[]),('VBG',[]),('VBN',[]),('VBP',[]),('VBZ',[]),('WDT',[]),('WP',[]),('WP$',[]),('WRB',[])]
# make this a load normal dictionary instead of create each time
fname = os.path.join(cwd, 'normalDictionary.txt')
if exists(fname):
with open(fname,'r') as file:
content = file.read().splitlines()
i = 0
while (i < len(content)):
for item in normalPOSlib:
if (item[0] == content[i]):
i += 1
poses = content[i].split(' ')
for pos in poses:
item[1].append(pos)
i += 1
else:
tagged = []
if (spacyFlag == True):
nlp = spacy.load('en')
#sentence = unicode(text, "utf-8")
charCount = 0
for curr in brown.words():
doc = nlp(curr)
for token in doc:
tagged.append((token.text, token.tag_))
else:
for curr in brown.words():
tokenized = word_tokenize(curr)
# get POS for every word
tag = pos_tag(tokenized)
tagged.append(tag)
#assign to a list
for val in tagged:
for pos in normalPOSlib:
if (pos[0] == val[1]): # the word is that pos
if (val[0].lower() not in pos[1]):
pos[1].append(val[0].lower())
normalPOSlib = weed_library(normalPOSlib)
# once done, save to file
with open(fname,'w') as file:
for ele in normalPOSlib:
file.write(ele[0]+'\n')
for item in ele[1]:
file.write(item+' ')
file.write('\n')
#print (normalPOSlib)
return (normalPOSlib)
def weed_library(posLib):
punctuations = '''``'!!!!!()-[]{};;;:'"\,,<>....|||/?@#$%^&*_~'''
tempLib = [('CC',[]),('CD',[]),('DT',[]),('EX',[]),('FW',[]),('IN',[]),('JJ',[]),('JJR',[]),('JJS',[]),('LS',[]),('MD',[]),('NN',[]),('NNP',[]),('NNPS',[]),('NNS',[]),('PDT',[]),('POS',[]),('PRP',[]),('PRP$',[]),('RB',[]),('RBR',[]),('RBS',[]),('RP',[]),('TO',[]),('UH',[]),('VB',[]),('VBD',[]),('VBG',[]),('VBN',[]),('VBP',[]),('VBZ',[]),('WDT',[]),('WP',[]),('WP$',[]),('WRB',[])]
for pos in posLib:
for temp in tempLib:
if (pos[0] == temp[0]): # once they're at the same pos
count = 0
for item in pos[1]:
if (item not in punctuations and item != 'n\'t' and item != ' ' and len(item) > 0): # not punctuation
for char in item:
if (char in punctuations):
count += 1
if (count < 2):
temp[1].append(item)
#else:
# print ('shouldn\'t be added')
return tempLib
# what each POS means
# CC- conjunction, coordinating
# CD- numeral, cardinal
# DT- determiner
# EX- existential there
# FW- idk what this is ********************************
# IN- preposition or conjunction, subordinating
# JJ- adjective or numeral, ordinal
# JJR - adjective, comparative
# JJS- adjective, superlative
# LS- list item marker
# MD - modal auxiliary
# NN - noun, common, singular or mass
# NNP- noun, proper, singular
# NNPS- noun, proper, not singular
# NNS- noun, common, plural
# PDT- pre determiner
# POS- genitive marker
# PRP- pronoun, personal
# PRP$- pronoun, possessive
# RB- adverb
# RBR- adverb, comparative
# RBS- adverb, superlative
# RP- particle
# TO- "to" as a preposition or infinite marker
# UH- interjection
# VB- verb, base format
# VBD- verb, past tense
# VBG- verb, present participle or gerund
# VBN- verb, past participle
# VBP- verb, present tense, not 3rd person singular
# VBZ- verb, present tense, 3rd person singular
# WDT- WH-determiner
# WP- WH-pronoun
# WP$- WH-pronoun, possessive
# WRB- WH-adverb