## Python Cleaning Up Text Files

In [3]:
# Demonstration of the failure mode: no `encoding` is passed, so open() falls
# back to the platform's default codec, which (per the traceback shown below)
# could not decode this file. The backslash-separated path is Windows-specific.
with open('data\\Veidenbaums.txt') as fin:
    mylines = fin.readlines()

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 13: character maps to <undefined>

In [2]:
# macOS / Linux: pass encoding='utf-8' explicitly so decoding does not depend
# on the platform default. The forward-slash relative path is not specific to
# these systems -- open() accepts it on Windows as well.
with open('data/Veidenbaums.txt', encoding='utf-8') as fin:
    mylines = fin.readlines()

In [5]:
# Windows: forward slashes in the path work here too; the explicit
# encoding='utf-8' keeps open() from using the platform-default codec.
with open('data/Veidenbaums.txt', encoding='utf-8') as fin:
    mylines = fin.readlines()

In [4]:
# How many lines were read from the file
len(mylines)

971

In [9]:
# Universal Operating System module library
import os

In [6]:
# File name only; the directory part is added when the full path is built
fname = "Veidenbaums.txt"

In [10]:
# Build the full path starting from the current working directory;
# os.path.join inserts the path separator of the running OS.
fpath = os.path.join(os.getcwd(), 'data', fname)
fpath

'C:\\Users\\Admin\\Documents\\Github\\RCS_Data_Analysis_Python_2019\\data\\Veidenbaums.txt'

In [21]:
# Destination path for the filtered copy of the text, built from the current
# working directory with os.path.join.
import os
newpath = os.path.join(os.getcwd(), 'data', 'cleaned.txt')
newpath

'C:\\Users\\Admin\\Documents\\Github\\RCS_Data_Analysis_Python_2019\\data\\cleaned.txt'

In [None]:
# Marker substrings for unwanted lines; this first version lists only '***'
badwords = ['***']

In [12]:
# Peek at the first five lines
mylines[:5]

['\n', '\n', 'Pēc ideāliem cenšas lielie gari***\n', '\n', '\n']

In [13]:
# Keep every line except those that contain the '***' marker.
cleanlines = []
for line in mylines:
    if '***' in line:
        continue  # marked line: leave it out
    cleanlines.append(line)

In [14]:
# Lines remaining once the '***' lines were dropped
len(cleanlines)

945

In [15]:
# List-comprehension form of the filter: keep the lines without the '***' marker
cleanlines2 = [line for line in mylines if '***' not in line]

In [17]:
# Length of the comprehension-built list
len(cleanlines2)

945

In [18]:
# Extended list of unwanted substrings (more than one marker)
badwords = ['***', "reallybadword"]

In [19]:
# General filter: copy the source file to the new file, leaving out every
# line that contains at least one of the substrings in `badwords`.
with open(fpath, encoding='utf-8') as oldfile, open(newpath, 'w', encoding='utf-8') as newfile:
    newfile.writelines(
        line for line in oldfile
        if all(badword not in line for badword in badwords)
    )


In [20]:
# Show where the cleaned file was written
newpath

'C:\\Users\\Admin\\Documents\\Github\\RCS_Data_Analysis_Python_2019\\data\\cleaned.txt'

In [22]:
# Read the cleaned file back as one string and report its length in characters
with open(newpath, encoding='utf-8') as f:
    txt=f.read()
len(txt)

12129

In [24]:
# First and last 100 characters of the cleaned text, shown as a tuple
txt[:100],txt[-100:]

('\n\n\n\n\n\nPēc ideāliem cenšas lielie gari,\n\nBet dzīvē ieņemt vietu pirmie\n\nTie neiespēj, tos nomāc maize',
 'a, cik siena viņas dos,\n\nLai mani nelasa. Pie manām dzejām\n\nTik piktu prātu viņš sev iemantos.\n\n\n\n\n\n')

# Exercise 

1. Saskaitīt tekstā sastopamo vārdu biežumu
2. Izdrukāt unikālo vārdu skaitu
3. Izdrukāt 20 biežāk sastopamo vārdu sarakstu
4. Kādi ir tekstā visretāk sastopamie vārdi?

In [25]:
# strip punctuation in a single pass: str.maketrans with a third argument maps
# each listed character to None, and str.translate then deletes those characters
# (for heavier cleaning tasks consider regular expressions)
badchars = ',;:!?."\'-'
txt = txt.translate(str.maketrans('', '', badchars))
len(txt)

11626

In [26]:
# First 100 characters after the punctuation was removed
txt[:100]

'\n\n\n\n\n\nPēc ideāliem cenšas lielie gari\n\nBet dzīvē ieņemt vietu pirmie\n\nTie neiespēj tos nomāc maizes '

In [None]:
# Whitespace normaliser (idiom credited to Alex Martelli): split on any run of
# whitespace, then re-join the pieces with single spaces
# ' '.join(mystring.split())

In [28]:
# split() with no arguments splits on runs of whitespace and discards empty
# strings, see https://docs.python.org/3/library/stdtypes.html#str.split
words = txt.split()
len(words)

1870

In [33]:
# Tally how often each lower-cased word occurs; dict.get supplies 0 for a
# word that has not been seen yet.
mydict = {}
for word in map(str.lower, words):
    mydict[word] = mydict.get(word, 0) + 1

In [34]:
# Number of distinct lower-cased words
len(mydict)

1072

In [36]:
# Materialise the (word, count) pairs as a list so they can be indexed
mycount = list(mydict.items())

In [37]:
# Confirm that the conversion produced a list
type(mycount)

list

In [38]:
# First (word, count) pair
mycount[0]

('pēc', 11)

In [None]:
# Type of a single entry produced by dict.items()
type(mycount[0])

In [39]:
# Title-cased copy of the word list (one entry per input word)
cwords = list(map(str.title, words))
len(cwords)

1870

In [None]:
# Count occurrences of each title-cased word in a single pass over the list.
# (Calling cwords.count(word) once per distinct word rescans the whole list
# every time, i.e. quadratic work.) Keys end up in first-appearance order,
# which is deterministic, unlike iteration over a set.
vdict = {}
for word in cwords:
    vdict[word] = vdict.get(word, 0) + 1
len(vdict)

In [None]:
# First five keys of the count dictionary
list(vdict.keys())[:5]

In [None]:
# Most frequent word: the key whose count is largest
max(vdict, key=vdict.get)

In [None]:
# Count for one specific word
vdict['Un']

In [None]:
# Sort the (word, count) pairs by count; sorted() defaults to ascending order
sortedWords = sorted(vdict.items(), key = lambda x: x[1])
len(sortedWords)

In [None]:
# items() returns a view object (dict_items), not a list
type(vdict.items())

In [None]:
# Most frequent words first. Rebuilding the list from vdict (instead of calling
# sortedWords.reverse() in place) gives the same ordering on a first run and
# keeps the cell idempotent: running it twice no longer flips the order back.
sortedWords = sorted(vdict.items(), key=lambda pair: pair[1])[::-1]

In [None]:
# First ten entries of the sorted list
sortedWords[:10]

In [None]:
# First fifty entries of the sorted list
sortedWords[:50]

In [40]:
# we can count with Counter no need to make our own dictionary!
from collections import Counter

In [41]:
# Counter tallies the elements of the list in one call
cnt = Counter(cwords)

In [42]:
# Check the type of the Counter object
type(cnt)

collections.Counter

In [44]:
# The 50 most frequent words with their counts, highest count first
cnt.most_common(50)

[('Un', 76),
 ('Ir', 24),
 ('Tik', 21),
 ('Vēl', 21),
 ('Tu', 21),
 ('Bet', 15),
 ('Kas', 15),
 ('Nav', 14),
 ('Man', 14),
 ('Par', 13),
 ('Kā', 13),
 ('Kur', 13),
 ('Lai', 12),
 ('Pēc', 11),
 ('Ar', 11),
 ('Tev', 11),
 ('Kam', 11),
 ('Tie', 10),
 ('Ka', 10),
 ('Uz', 10),
 ('Reiz', 10),
 ('Būs', 9),
 ('Es', 9),
 ('Viss', 9),
 ('No', 9),
 ('Pie', 9),
 ('Tur', 9),
 ('Vai', 9),
 ('Kad', 8),
 ('Tad', 8),
 ('Līdz', 8),
 ('Cik', 7),
 ('Mums', 7),
 ('Sauc', 6),
 ('Ja', 6),
 ('Bij', 6),
 ('Tās', 6),
 ('Iet', 6),
 ('Gars', 6),
 ('Projām', 6),
 ('Gan', 5),
 ('Laiks', 5),
 ('Sirds', 5),
 ('Nu', 5),
 ('Jau', 5),
 ('Tā', 5),
 ('Tomēr', 5),
 ('Likumīgi', 5),
 ('Dzīves', 5),
 ('Iedzer', 5)]

In [None]:
# The 10 most frequent words with their counts, highest count first
cnt.most_common(10)

In [None]:
# Reading files from URL
# https://docs.python.org/3.7/library/urllib.request.html#module-urllib.request