This repository has been archived by the owner on Apr 30, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tommy_final.py
65 lines (44 loc) · 1.57 KB
/
tommy_final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# coding: utf-8
# In[5]:
from google_ngram_downloader import readline_google_store
import operator
import nltk
import re
import string
# Count how often each four-character string occurs in the Google Books
# simplified-Chinese ('chi-sim') ngram data for ngram lengths 1 through 4,
# then write the 100 most frequent strings to "top_100_updated.txt".

# Matches every character that is dropped from an ngram before it is counted:
# ASCII letters and digits, whitespace, underscore and the ASCII punctuation
# listed in string.punctuation.
# NOTE(review): string.punctuation is ASCII-only, so full-width CJK
# punctuation is not stripped here -- confirm that this is intended.
_NOISE_CHARS = re.compile(r'[A-Za-z0-9\s_%s]' % re.escape(string.punctuation))

# Only cleaned strings of exactly this many characters are counted.
_TARGET_LENGTH = 4


def _clean_ngram(raw):
    """Return *raw* with every character matched by ``_NOISE_CHARS`` removed.

    A single precompiled substitution replaces the original per-character
    ``re.search`` loop followed by ``re.sub(..., temp, re.UNICODE)``. That
    call passed the flag as the positional ``count`` argument, which capped
    the number of punctuation removals at 32 instead of setting a flag.
    """
    return _NOISE_CHARS.sub('', raw)


# Maps each cleaned four-character string to its summed occurrence count.
total = {}
for n in range(1, 5):
    # Only the first (fname, url, records) triple yielded for each ngram
    # length is consumed.
    fname, url, records = next(readline_google_store(ngram_len=n, lang='chi-sim'))
    try:
        # Iterating directly ends cleanly when the record stream is exhausted,
        # so no exception is needed to detect the end of the data.
        for myrecord in records:
            temp = _clean_ngram(myrecord[0])
            if len(temp) != _TARGET_LENGTH:
                continue
            # myrecord[2] is the number of times the ngram showed up; the same
            # cleaned string can come from several records, so counts are
            # accumulated.
            total[temp] = total.get(temp, 0) + myrecord[2]
    except Exception as exc:
        # Best effort, as before: keep what was counted so far and move on to
        # the next ngram length, but report the failure instead of hiding it.
        # KeyboardInterrupt is deliberately no longer swallowed.
        print("Stopped reading %s early: %r" % (fname, exc))

# (string, count) pairs ordered from most to least frequent.
sorted_x = sorted(total.items(), key=operator.itemgetter(1), reverse=True)
top_100 = sorted_x[:100]  # change the slice bound to expand the list, e.g. [:10] for the top 10

# One "<string>,<count>" line per entry; the context manager closes the file
# even if a write fails part-way through.
with open("top_100_updated.txt", "w", encoding='utf-8') as f:
    f.writelines(text + "," + str(count) + "\n" for text, count in top_100)
# In[6]:
# Write every counted string with its total to "total.txt", one
# "<string>,<count>" line per entry, in the order held by sorted_x.
# The context manager closes the file even if a write fails part-way through.
with open("total.txt", "w", encoding='utf-8') as g:
    g.writelines(text + "," + str(count) + "\n" for text, count in sorted_x)
# In[ ]: