## Python Cleaning Up Text Files

In [1]:
import os

In [2]:
fname = "Veidenbaums.txt"

In [3]:
fpath = os.path.join(os.getcwd(), 'data', fname)
fpath

'C:\\Users\\val-p1\\Github\\RCS_Data_Analysis_Python_2019_July\\data\\Veidenbaums.txt'

In [4]:
newpath = os.path.join(os.getcwd(), 'data', 'cleaned.txt')
newpath

'C:\\Users\\val-p1\\Github\\RCS_Data_Analysis_Python_2019_July\\data\\cleaned.txt'

In [5]:
with open(fpath) as f:
    mylist = f.readlines()
len(mylist)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 13: character maps to <undefined>

In [None]:
print("Hello Python")

In [6]:
!python --version

Python 3.7.3


In [8]:
# we can check filesize first if we are worried about reading whole file into our RAM
os.path.getsize(fpath)

14885

In [9]:
with open(fpath, encoding="utf-8") as f:
    mylist = f.readlines()
len(mylist)

971

In [10]:
with open(fpath, encoding="utf-8") as f:
    rawtext = f.read()
len(rawtext)

12875

In [11]:
mylist[:10]

['\n',
 '\n',
 'Pēc ideāliem cenšas lielie gari***\n',
 '\n',
 '\n',
 '\n',
 '\n',
 'Pēc ideāliem cenšas lielie gari,\n',
 '\n',
 'Bet dzīvē ieņemt vietu pirmie\n']

In [13]:
mylist[:50]

['\n',
 '\n',
 'Pēc ideāliem cenšas lielie gari***\n',
 '\n',
 '\n',
 '\n',
 '\n',
 'Pēc ideāliem cenšas lielie gari,\n',
 '\n',
 'Bet dzīvē ieņemt vietu pirmie\n',
 '\n',
 'Tie neiespēj, tos nomāc maizes kari,\n',
 '\n',
 'Tos nomāc aizspriedumi sirmie.\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 'Virs zemes nav taisnības***\n',
 '\n',
 '\n',
 '\n',
 '\n',
 'Virs zemes nav taisnības, dūrei tik spēks,\n',
 '\n',
 'Kas varmākām skādi dar, nosaukts tiek grēks.\n',
 '\n',
 'Par tiesnešiem cienīti blēži sēž\n',
 '\n',
 'Un godīgie ādu nost citiem plēš.\n',
 '\n',
 'Un cienīgs tēvs, zaglis, teic sprediķus;\n',
 '\n',
 '"Tik pacieties, debesīs labāki būs!"\n',
 '\n',
 'Virs zemes nav laimes, tik zvēru pulks bļauj,\n',
 '\n',
 'Viens otram iz mutes tie maizi sev rauj,\n',
 '\n',
 'Un priecīgs ikkatris, kad vēders tik pilns,\n',
 '\n',
 'Kad bērni ir veseli, dzīvoklis silts.\n',
 '\n',
 'Un glaimojot salkušie rakstnieki sauc:\n',
 '\n',
 '"Cik praktiska tauta! Tai cerību daudz."\n',
 '\n',
 'Jā, c

In [12]:
mylist[-10:]

['Tik apskaita, cik siena viņas dos,\n',
 '\n',
 'Lai mani nelasa. Pie manām dzejām\n',
 '\n',
 'Tik piktu prātu viņš sev iemantos.\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n']

In [14]:
stars = "***"
stars

'***'

In [15]:
stars in mylist[0]

False

In [16]:
stars in mylist[2]

True

In [17]:
rawtext.count(stars)

26

In [18]:
total = 0
lcount = 0
for line in mylist:
    if stars in line:
        total += 1
    lcount += 1
print("Total lines", lcount)
print("Starred lines", total)

Total lines 971
Starred lines 26


In [19]:
# for those times when we need an index, we can use enumerate
total = 0
for i, line in enumerate(mylist):
    if stars in line:
        total += 1
        print("Found ",stars, "on Line:", i)

print("Total lines", i)
print("Starred lines", total)

Found  *** on Line: 2
Found  *** on Line: 19
Found  *** on Line: 68
Found  *** on Line: 101
Found  *** on Line: 230
Found  *** on Line: 271
Found  *** on Line: 328
Found  *** on Line: 365
Found  *** on Line: 382
Found  *** on Line: 423
Found  *** on Line: 452
Found  *** on Line: 493
Found  *** on Line: 526
Found  *** on Line: 551
Found  *** on Line: 584
Found  *** on Line: 613
Found  *** on Line: 674
Found  *** on Line: 707
Found  *** on Line: 732
Found  *** on Line: 753
Found  *** on Line: 775
Found  *** on Line: 820
Found  *** on Line: 853
Found  *** on Line: 894
Found  *** on Line: 927
Found  *** on Line: 954
Total lines 970
Starred lines 26


In [21]:
badwords = ['***','ReallyBadWord']

In [22]:
# This recipe will clean small,big and huge files 
# from all lines containing bad words in the list badwords
with open(fpath, encoding='utf-8') as oldfile, open(newpath, mode='w', encoding='utf-8') as newfile:
    for line in oldfile:
        if not any(badword in line for badword in badwords):
            newfile.write(line)


In [23]:
with open(newpath, encoding='utf-8') as f:
    txt=f.read()
len(txt)

12129

In [24]:
len(rawtext),len(txt)

(12875, 12129)

# Exercise 

1. Saskaitīt tekstā sastopamo vārdu biežumu
2. Izdrukāt unikālo vārdu skaitu
3. Izdrukāt 20. biežāk sastopamo vārdu sarakstu
4. Kādi ir tekstā visretāk sastopamie vārdi?

In [25]:
mylist[100:130]

['\n',
 'Domāju es domas dziļas***\n',
 '\n',
 '\n',
 '\n',
 '\n',
 'Domāju es domas dziļas,\n',
 '\n',
 'Kādēļ laime mani nīd;\n',
 '\n',
 'Viss, ko daru, viss man viļas,\n',
 '\n',
 'Visās vietās kājas slīd.\n',
 '\n',
 'Citam nav nekāda darba,\n',
 '\n',
 'Nauda kā no gaisa birst.\n',
 '\n',
 'Man ir dzīve sūra, skarba,\n',
 '\n',
 'Jābadojas, kamēr mirst.\n',
 '\n',
 'Nav pie dvēseles ne graša,\n',
 '\n',
 'Prasi kādam - kas tev dos?\n',
 '\n',
 'Labāk vienreiz nāve aša\n',
 '\n',
 'Nekā dzīve nabagos!\n',
 '\n']

In [26]:
badchars = ',;:!?."\'-'
badchars

',;:!?."\'-'

In [27]:
newname = 'Valdis'.replace('al', 'od')
newname

'Voddis'

In [None]:
newname.replace()

In [28]:
# with this recipe we can clear the text of all bad chars
for char in badchars:
    print("Cleaning text from ", char)
    txt = txt.replace(char, '') # suggestion to use regular expressions for heavier tasks
len(txt)

Cleaning text from  ,
Cleaning text from  ;
Cleaning text from  :
Cleaning text from  !
Cleaning text from  ?
Cleaning text from  .
Cleaning text from  "
Cleaning text from  '
Cleaning text from  -


11626

In [29]:
txt[:50]

'\n\n\n\n\n\nPēc ideāliem cenšas lielie gari\n\nBet dzīvē i'

In [None]:
# Alex Martelli's extra whitespace cleaner
# ' '.join(mystring.split())

In [30]:
words = txt.split()
len(words)

1870

In [31]:
words[:10]

['Pēc',
 'ideāliem',
 'cenšas',
 'lielie',
 'gari',
 'Bet',
 'dzīvē',
 'ieņemt',
 'vietu',
 'pirmie']

In [32]:
words[-10:]

['nelasa',
 'Pie',
 'manām',
 'dzejām',
 'Tik',
 'piktu',
 'prātu',
 'viņš',
 'sev',
 'iemantos']

In [33]:
nums = list(range(10))
nums

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [34]:
numtxt = str(nums)
numtxt

'[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]'

In [38]:
mynums = numtxt.split(", ")
mynums

['[0', '1', '2', '3', '4', '5', '6', '7', '8', '9]']

In [39]:
type(mynums)

list

In [40]:
type(mynums[0])

str

In [41]:
"Valdis".lower()

'valdis'

In [42]:
# generate a list with all words in lowercase from words list 
newlist = []
for word in words:
    newlist.append(word.lower())
print(len(newlist))
    

1870


In [43]:
nlist = [w.lower() for w in words]
len(nlist)

1870

In [44]:
newlist[:10]

['pēc',
 'ideāliem',
 'cenšas',
 'lielie',
 'gari',
 'bet',
 'dzīvē',
 'ieņemt',
 'vietu',
 'pirmie']

In [None]:
# List Comprehension
nlist = [word.lower() for word in words]
nlist

In [45]:
cwords = [word.title() for word in words]
len(cwords)

1870

In [46]:
nlist.count('alus')

2

In [47]:
uniq = set(nlist)
len(uniq)

1072

In [None]:
mytemp = { 'alus' : nlist.count('alus')}
mytemp

In [48]:
# Dictionary comprehension
mydict = { word : nlist.count(word) for word in set(nlist)}
len(mydict)

1072

In [49]:
mydict['alus']

2

In [50]:
newdict = {}
for word in nlist:
    if word in newdict.keys():
        newdict[word] += 1
    else:
        newdict[word] = 1
len(newdict)

1072

In [51]:
newdict['alus']

2

In [52]:
sorted(mydict.items())[:10]

[('(vēstule', 1),
 ('1890', 1),
 ('5', 1),
 ('acs', 1),
 ('agrāk', 1),
 ('aiz', 1),
 ('aizgāja', 1),
 ('aizmirsts', 1),
 ('aizspriedumi', 1),
 ('ak', 1)]

In [None]:
# the el[1] refers to second element of the tuple in mydict.items() list
sortedfrequency = sorted(mydict.items(),key=lambda el: el[1], reverse=True)


In [None]:
sorted(list('kartupelis'))

In [None]:
sortedfrequency[:10]

In [None]:
somewords = [item[0] for item in longcnt.most_common(10)]
somewords

In [None]:
sorted(somewords)

In [None]:
def mycomparison(el):
    return el[::-1]

In [None]:
sorted(somewords, key = mycomparison)

In [None]:
# in lambda functions return is implied and comes right after the first : 
sorted(somewords, key = lambda el: el[::-1])

In [None]:
vdict = { word: cwords.count(word) for word in set(cwords)}
len(vdict)

In [None]:
list(vdict.keys())[:5]

In [None]:
max(vdict, key = lambda k: vdict[k])

In [None]:
vdict['Un']

In [None]:
sortedWords = sorted(vdict.items(), key = lambda x: x[1])
len(sortedWords)

In [None]:
type(vdict.items())

In [None]:
sortedWords.reverse()

In [None]:
sortedWords[:10]

In [None]:
sortedWords[:50]

In [53]:
from collections import Counter

In [55]:
wcount = Counter('abracadabra')
wcount.most_common()

[('a', 5), ('b', 2), ('r', 2), ('c', 1), ('d', 1)]

In [54]:
ncount = Counter(nlist)
ncount.most_common(20)

[('un', 76),
 ('ir', 24),
 ('tik', 21),
 ('vēl', 21),
 ('tu', 21),
 ('bet', 15),
 ('kas', 15),
 ('nav', 14),
 ('man', 14),
 ('par', 13),
 ('kā', 13),
 ('kur', 13),
 ('lai', 12),
 ('pēc', 11),
 ('ar', 11),
 ('tev', 11),
 ('kam', 11),
 ('tie', 10),
 ('ka', 10),
 ('uz', 10)]

In [56]:
# we filter out all words that are shorter than 3 or less characters
longlist = [word for word in nlist if len(word) > 3]

In [57]:
len(longlist)

1288

In [58]:
longcnt = Counter(longlist)
longcnt.most_common(20)

[('reiz', 10),
 ('viss', 9),
 ('līdz', 8),
 ('mums', 7),
 ('sauc', 6),
 ('gars', 6),
 ('projām', 6),
 ('laiks', 5),
 ('sirds', 5),
 ('tomēr', 5),
 ('likumīgi', 5),
 ('dzīves', 5),
 ('iedzer', 5),
 ('tiek', 4),
 ('daudz', 4),
 ('vaigs', 4),
 ('kājām', 4),
 ('pasaules', 4),
 ('bija', 4),
 ('kaut', 4)]

In [59]:
type(longcnt)

collections.Counter

In [66]:
type(longcnt.most_common())

list

In [67]:
commonlongwords = longcnt.most_common(20)

In [68]:
commonlongwords

[('reiz', 10),
 ('viss', 9),
 ('līdz', 8),
 ('mums', 7),
 ('sauc', 6),
 ('gars', 6),
 ('projām', 6),
 ('laiks', 5),
 ('sirds', 5),
 ('tomēr', 5),
 ('likumīgi', 5),
 ('dzīves', 5),
 ('iedzer', 5),
 ('tiek', 4),
 ('daudz', 4),
 ('vaigs', 4),
 ('kājām', 4),
 ('pasaules', 4),
 ('bija', 4),
 ('kaut', 4)]

In [64]:
longlist[:10]

[('ideāliem', 1),
 ('cenšas', 1),
 ('lielie', 1),
 ('gari', 1),
 ('dzīvē', 2),
 ('ieņemt', 1),
 ('vietu', 1),
 ('pirmie', 1),
 ('neiespēj', 1),
 ('nomāc', 2)]

In [72]:
# we can update counter values manually it will add new values
longcnt.update({'sirds':13})

In [73]:
longcnt.most_common(10)

[('sirds', 34),
 ('reiz', 10),
 ('viss', 9),
 ('līdz', 8),
 ('mums', 7),
 ('sauc', 6),
 ('gars', 6),
 ('projām', 6),
 ('laiks', 5),
 ('tomēr', 5)]

In [74]:
longcnt['sirds'] += 3

In [75]:
longcnt.most_common(5)

[('sirds', 37), ('reiz', 10), ('viss', 9), ('līdz', 8), ('mums', 7)]

In [76]:
longcnt.update({'sirds':20, 'gars':50})
longcnt.most_common(10)

[('sirds', 57),
 ('gars', 56),
 ('reiz', 10),
 ('viss', 9),
 ('līdz', 8),
 ('mums', 7),
 ('sauc', 6),
 ('projām', 6),
 ('laiks', 5),
 ('tomēr', 5)]

In [None]:
cnt = Counter(nlist)

In [None]:
cnt.most_common(10)

In [None]:
cnt.most_common(50)

In [None]:
# Reading files from URL
# https://docs.python.org/3.7/library/urllib.request.html#module-urllib.request

In [79]:
from collections import Counter

In [80]:
wcnt = Counter("abbbbbbaabbbccdfdfad")
wcnt.most_common()

[('b', 9), ('a', 4), ('d', 3), ('c', 2), ('f', 2)]