## Python fundamentals

In [32]:
# Extract information from a nested python data sturcture
data = {
    "doc1": [
        {"text": "Hello world!", "meta": {"speaker": "A", "keep": True}},
        {"text": "Skip me",       "meta": {"speaker": "B", "keep": False}},
    ],
    "doc2": [
        {"text": "Another line.", "meta": {"speaker": "A", "keep": True}},
        {"text": "Edge: empty",   "meta": {"speaker": "C", "keep": True}},
    ],
}


In [97]:
def collect_text(data, speaker):
    output = []
    for i in data.values():
        for j in i:
            temp = j.get('meta')
            if temp.get('speaker') == speaker and temp.get('keep') == True:
                output.append(j.get('text'))


    return "\n".join(output)

# quick test
result = collect_text(data, speaker = "A")
print(result)


Hello world!
Another line.


In [None]:
# Open a text file for reading/writing with appropriate encoding
def process_file(input_path, output_path):
    with open(input_path, 'r', encoding = 'utf-8') as input, open(output_path, 'w', encoding = 'utf-8') as output:
        for line in input:
            line = line.strip()
            line = line.upper()
            output.write(line + '\n')


# quick test
n = process_file("test.txt", "output.txt")
print("Lines written:", n)

Lines written: None


In [48]:
# Create a dictionary of counts (e.g. word counts)
text = 'This is a test. This test is simple.'

In [71]:
count = {}
for word in text.split():
    temp = []
    for letter in word:
        if letter.isalpha() == True:
            temp.append(letter)
    processed_word = ''.join(temp).lower()
    if processed_word not in count.keys():
        count[processed_word] = 0
        count[processed_word] += 1
    else:
        count[processed_word] += 1
    
# or use counter 

from collections import Counter
count = Counter([i.lower() for i in text.split()])
count

Counter({'this': 2, 'is': 2, 'a': 1, 'test.': 1, 'test': 1, 'simple.': 1})

In [77]:
# Convert a dictionary of counts into a probability distribution
counts = {
    'this': 2,
    'is': 2,
    'a': 1,
    'test': 2,
    'simple': 1
}

In [80]:
probs = {}
for i in counts.keys():
    probs[i] = counts.get(i)/sum(counts.values())
probs

{'this': 0.25, 'is': 0.25, 'a': 0.125, 'test': 0.25, 'simple': 0.125}

In [None]:
# Sorting words based on their counts stored in a dictionary
# Becareful with the tie
counts = {'b': 2, 'a': 2, 'c': 1}

In [101]:
sorted(counts, key = lambda x: (-counts[x], x), reverse = False)

['a', 'b', 'c']

In [104]:
# Use NLTK to do sentence segmentation and/or word tokenization of a text
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
text = "Hi! I'm Tianhao. I study at UBC (MDS). Do you like NLP?"

In [105]:
sent_tokenize(text)

['Hi!', "I'm Tianhao.", 'I study at UBC (MDS).', 'Do you like NLP?']

In [106]:
word_tokenize(text)

['Hi',
 '!',
 'I',
 "'m",
 'Tianhao',
 '.',
 'I',
 'study',
 'at',
 'UBC',
 '(',
 'MDS',
 ')',
 '.',
 'Do',
 'you',
 'like',
 'NLP',
 '?']

In [107]:
# Use NLTK to do POS tagging of one or more tokenized sentences
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

text = "I like natural language processing."

In [111]:
pos_tag(word_tokenize(text))

[('I', 'PRP'),
 ('like', 'VBP'),
 ('natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('.', '.')]

In [151]:
# Use string methods/operators to extract a substring
s = "ID=12345;NAME=Tianhao;ROLE=student"

for part in s.split(';'):
    if part.startswith("NAME="):
        name = part.split('=', 1)[1]
name


'Tianhao'

In [160]:
#Use string methods/operators to identify/filter a particular kind of string
strings = [
    "apple",
    "Banana",
    "123",
    "hello!",
    "NLP",
    "data_science",
    "42nd"
]

[i for i in strings if i.isalpha()]

['apple', 'Banana', 'NLP']

In [162]:
# Use string methods/operators to generate a larger string
tokens = ["I", "love", "NLP"]
' '.join(tokens) + '!'

'I love NLP!'

In [164]:
# Convert numerical strings to ints/floats and vice versa
nums_as_strings = ["10", "3.14", "42", "0.5"]

output = []
for num in nums_as_strings:
    if '.' in num:
        output.append(float(num))
    else:
        output.append(int(num))

output_2 = []
for num in output:
    output_2.append(str(num))

output_2

['10', '3.14', '42', '0.5']

In [165]:
# Writing a string literal and/or regex with appropriate escaping
import re

text = "The file is called data_v1.0.txt and saved in /home/user/docs."

# TODO: write the regex pattern with correct escaping
pattern = r'data_v1\.0\.txt'

match = re.search(pattern, text)
print(match.group() if match else "No match")


data_v1.0.txt


In [166]:
# Code a simple regex (difficulty level similar to 1.1 or 1.2 on lab 3)
positive_examples = ["00:00", "09:15", "12:30", "23:59"]
negative_examples = ["24:00", "23:60", "7:05", "07:5", "00:000", "12-30", "123:45"]

pattern = r'\b(?:[01][0-9]|2[0-3]):[0-5][0-9]\b'

In [167]:
# Look for a match or multiple matches in a string or strings using a regex
text = """
I arrived on 2024/01/17.
My previous visits were on 1977/10/25 and 2012/03/09.
An invalid date like 2024/13/40 should not count.
"""

pattern = r'\b(?:1[0-9]{3}|2[0-9]{3})/(?:0[1-9]|1[0-2])/(?:0[1-9]|[12][0-9]|3[01])\b'

In [171]:
# Iterate over the sentences and/or words of an NLTK corpus
import nltk
from nltk.corpus import brown
for sent in brown.sents():
    print(sent)

for word in brown.words():
    print(word)

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.']
['``', 'Only', 'a', 'relative', 'handful', 'of', 'such', 'reports

KeyboardInterrupt: 

In [None]:
# Get a set corresponding to the vocabulary of an NLTK corpus
import nltk
from nltk.corpus import brown

# If needed once:
# nltk.download("brown")

# TODO:
# create a set of unique words in the corpus

vocab = set(brown.words())

print(len(vocab))
print(list(vocab)[:10])   # just inspect a few items


56057
["Where'd", 'indentations', 'superior', 'pitches', 'beep', 'trails', 'Pp.', 'abstracted', 'psychologists', 'Cotton']


In [224]:
# Get a set corresponding to an NLTK lexicon
import nltk
from nltk.corpus import wordnet as wn

# If needed once:
# nltk.download("wordnet")

# TODO:
# create a set of words from the WordNet lexicon

lexicon_set = set(wn.words())


In [226]:
# Get the intersection/differences of lexicons.
lexicon_set.intersection(vocab)

{'superior',
 'beep',
 'abstracted',
 'electrically',
 'sand',
 'uptake',
 'bewitched',
 'piping',
 'overexploitation',
 'broadening',
 'repentant',
 'igneous',
 'kindness',
 'crypt',
 'mom',
 'linoleum',
 'painful',
 'veranda',
 'oiled',
 'racketeer',
 'quantity',
 'negation',
 'meaty',
 'controversial',
 'clothier',
 'hockey',
 'check',
 'income',
 'using',
 'exquisite',
 'downfall',
 'unite',
 'dilettante',
 'palatability',
 'introverted',
 'swim',
 'polyester',
 'twisted',
 'believably',
 'vehemently',
 'nobleman',
 'scope',
 'unidentified',
 'anatomical',
 'survive',
 'idealism',
 'obliterated',
 'lumpish',
 'natty',
 'singing',
 'sample',
 'agile',
 'wiggle',
 'misleading',
 'leadership',
 'seamless',
 'sculptor',
 'metronome',
 'loon',
 'severing',
 'unheeding',
 'flare',
 'lying',
 'hydrolysis',
 'localized',
 'immunization',
 'orthodontics',
 'contrived',
 'represent',
 'ordnance',
 'reinstall',
 'veridical',
 'navigate',
 'credited',
 'tangled',
 'jewel',
 'congeniality',
 'g

In [227]:
# Load an XML document from the web into BeautifulSoup or lxml
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://www.w3schools.com/xml/note.xml"
response = urlopen(url)
soup = BeautifulSoup(response, "xml")

print(soup)


<?xml version="1.0" encoding="utf-8"?>
<note>
<to>Tove</to>
<from>Jani</from>
<heading>Reminder</heading>
<body>Don't forget me this weekend!</body>
</note>


In [216]:
# Look for particular nodes in an XML document tree
soup.find('heading')
soup.find_all('heading')

soup.find("heading", {"type": "reminder"})
note = soup.find("note")
heading = note.find("heading")
heading

<heading>Reminder</heading>

In [None]:
# Access the attributes and/or text for in XML
heading = soup.find("heading")
print(heading.text)

print(heading.string)

print(heading.get("type"))

for h in soup.find_all("heading"):
    print(h.get("type"), h.text)

Reminder
Reminder
None
