# Problem 01: Text Preprocessing

In [None]:
input_str = "Hello, I'm a member of AI VIETNAM AIO 2022 courses."

### Task 1: chuyển chữ viết thường

In [None]:
def lowercase(text):
  """Return a lowercased copy of ``text``."""
  lowered = text.lower()
  return lowered

print(lowercase(input_str))

hello, i'm a member of ai vietnam aio 2022 courses.


### Task 2: chuyển chữ viết hoa

In [None]:
def uppercase(text):
  """Return an uppercased copy of ``text``."""
  shouted = text.upper()
  return shouted

print(uppercase(input_str))

HELLO, I'M A MEMBER OF AI VIETNAM AIO 2022 COURSES.


### Task 3: xóa đường dẫn url

In [None]:
def remove_url(text):
  """Drop every whitespace-separated token that starts with an HTTP(S) scheme.

  The text is split on whitespace and re-joined with single spaces, so runs of
  whitespace (including newlines) are collapsed in the result.

  Args:
    text: input string.

  Returns:
    The remaining tokens joined by a single space.
  """
  url_schemes = ("http://", "https://")
  # Compare on a lowercased copy so "HTTP://..." / "Https://..." are caught too.
  kept = [w for w in text.split() if not w.lower().startswith(url_schemes)]
  return " ".join(kept)

print(remove_url("Hello this is the facebook url: https://www.facebook.com"))

Hello this is the facebook url:


### Task 4: xóa các thẻ html

In [None]:
!pip install bs4 -q

In [None]:
import re
from bs4 import BeautifulSoup

# method 1: let BeautifulSoup parse the markup and return only its text
def remove_html_tags_bs4(text):
  """Return the text content of ``text`` as extracted by BeautifulSoup.

  The parser is named explicitly so the result does not depend on which
  optional parsers happen to be installed.
  """
  soup = BeautifulSoup(text, "html.parser")
  return soup.get_text()

# method 2: strip anything that looks like a tag with a regular expression
def remove_html_tags(text):
  """Return ``text`` with every non-greedy ``<...>`` span removed.

  This is the variant bound to the name ``remove_html_tags``; the
  BeautifulSoup variant is available as ``remove_html_tags_bs4``.
  """
  return re.sub('<.*?>', '', text)

print(remove_html_tags("<h>this is an html tags:</h> <p>ai vietnam</p>"))

this is an html tags: ai vietnam


### Task 5: xóa dấu câu

In [None]:
import string

def remove_puncts(text):
  """Return ``text`` with every character in ``string.punctuation`` deleted."""
  deletion_table = str.maketrans("", "", string.punctuation)
  return text.translate(deletion_table)

print(remove_puncts(input_str))

Hello Im a member of AI VIETNAM AIO 2022 courses


In [None]:
# Show the characters of string.punctuation, four per row, tab-separated.
for position, symbol in enumerate(string.punctuation, start=1):
  print(symbol, end='\t')
  if position % 4 == 0:
    # print('\n') emits the newline character plus print's own line ending,
    # which leaves a blank line between rows.
    print('\n')

!	"	#	$	

%	&	'	(	

)	*	+	,	

-	.	/	:	

;	<	=	>	

?	@	[	\	

]	^	_	`	

{	|	}	~	



In [None]:
print(remove_puncts("Hello, we're AIVN."))

Hello were AIVN


### Task 6: xóa stopwords

In [None]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

def remove_stopwords(text):
  """Remove English stopwords from ``text``.

  Tokens are obtained by splitting on whitespace and are compared exactly
  (case-sensitively) against ``stopwords.words("english")``.

  Args:
    text: input string.

  Returns:
    The surviving tokens joined by a single space.
  """
  # Build the lookup set once, outside the comprehension, so the stopword
  # list is not reloaded for every token.
  english_stopwords = set(stopwords.words("english"))
  return " ".join([w for w in text.split() if w not in english_stopwords])

print(remove_stopwords(input_str))

Hello, I'm member AI VIETNAM AIO 2022 courses.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Task 7: xóa các từ có tần suất xuất hiện nhiều nhất (phạm vi trong 1 đoạn văn bản)

In [None]:
def remove_freq_words(text, num_to_remove=2):
  """Remove the most frequent words of ``text``.

  Frequencies are computed over the whitespace-separated tokens of ``text``
  itself. Ties are broken by first occurrence.

  Args:
    text: input string.
    num_to_remove: how many of the top-ranked distinct words to drop; it is
      used as a slice bound on the ranked word list.

  Returns:
    The remaining tokens, in original order, joined by a single space.
  """
  from collections import Counter  # local import keeps this cell self-contained

  words = text.split()
  counts = Counter(words)
  # sorted() is stable, so equally frequent words keep first-seen order.
  ranked = sorted(counts, key=counts.get, reverse=True)
  # Compute the removal set once instead of once per token.
  to_remove = set(ranked[:num_to_remove])
  return " ".join([w for w in words if w not in to_remove])

print(remove_freq_words("hello hello a a a ai vietnam"))

ai vietnam


### Task 8: sửa lỗi chính tả

In [None]:
!pip install autocorrect -q

In [None]:
from autocorrect import Speller
autocorrect_spell = Speller(lang='en')
def word_correction(text):
  """Run ``autocorrect_spell`` on each whitespace-separated token of ``text``.

  Returns the corrected tokens joined by a single space.
  """
  corrected_tokens = map(autocorrect_spell, text.split())
  return " ".join(corrected_tokens)
print(word_correction("hellox weclome to aivn"))

hello welcome to ain


### Task 9: stemming

In [None]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
def stemming(text):
  """Stem each whitespace-separated token of ``text`` with the module-level ``stemmer``.

  Returns the stems joined by a single space.
  """
  stems = []
  for token in text.split():
    stems.append(stemmer.stem(token))
  return " ".join(stems)

print(stemming("We are learning text preprocessing"))

we are learn text preprocess


### Task 10: lemmatization

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatize(text):
  """Run ``text`` through the module-level ``nlp`` pipeline and join each token's ``lemma_``.

  Returns the lemmas joined by a single space.
  """
  lemmas = [token.lemma_ for token in nlp(text)]
  return " ".join(lemmas)

print(lemmatize("I hope you all the best"))

I hope you all the good


## TextPreprocessor

In [None]:
!pip install autocorrect bs4 -q

In [None]:
import string
import nltk
import spacy


nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from autocorrect import Speller
from bs4 import BeautifulSoup

class TextPreprocessor:
  """Configurable text-cleaning pipeline built from small string-to-string steps.

  Each public method takes a string and returns a new string;
  ``preprocess_text`` chains them according to boolean switches.
  """

  def __init__(self, stopwords=None, stemmer=None):
    """Set up the resources used by the individual steps.

    Args:
      stopwords: object exposing ``words("english")``; defaults to
        ``nltk.corpus.stopwords``.
      stemmer: object exposing ``stem(word)``; defaults to a fresh
        ``PorterStemmer``.
    """
    if stopwords is None:
      # The parameter shadows the module-level name, so import under an alias.
      from nltk.corpus import stopwords as nltk_stopwords
      stopwords = nltk_stopwords
    self.stopwords = stopwords
    # Create the default per instance rather than once at definition time.
    self.stemmer = PorterStemmer() if stemmer is None else stemmer
    self.spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    self.autocorrect_spell = Speller(lang='en')

  def lowercasing(self, text):
    """Return a lowercased copy of ``text``."""
    return text.lower()

  def uppercasing(self, text):
    """Return an uppercased copy of ``text``."""
    return text.upper()

  def remove_puncts(self, text):
    """Delete every character listed in ``string.punctuation``."""
    return text.translate(str.maketrans("", "", string.punctuation))

  def remove_stopwords(self, text):
    """Drop tokens that exactly match an entry of ``self.stopwords.words("english")``."""
    # Build the lookup set once per call instead of once per token.
    english_stopwords = set(self.stopwords.words("english"))
    return " ".join([w for w in text.split() if w not in english_stopwords])

  def remove_freq_words(self, text, num_to_remove=1):
    """Drop the ``num_to_remove`` most frequent words of ``text``.

    Frequencies are counted over the whitespace-separated tokens of ``text``;
    ties are broken by first occurrence.
    """
    from collections import Counter  # local import keeps the cell self-contained

    words = text.split()
    counts = Counter(words)
    # sorted() is stable, so equally frequent words keep first-seen order.
    ranked = sorted(counts, key=counts.get, reverse=True)
    to_remove = set(ranked[:num_to_remove])
    return " ".join([w for w in words if w not in to_remove])

  def stemming(self, text):
    """Apply ``self.stemmer.stem`` to each whitespace-separated token."""
    return " ".join([self.stemmer.stem(w) for w in text.split()])

  def lemmatizing(self, text):
    """Run the spaCy pipeline on ``text`` and join each token's ``lemma_``."""
    doc = self.spacy_nlp(text)
    return " ".join([token.lemma_ for token in doc])

  def remove_url(self, text):
    """Drop tokens that start with ``http://`` or ``https://`` (any letter case).

    The check is case-insensitive because ``preprocess_text`` changes case
    before this step; an upper-cased ``HTTPS://`` must still be recognised.
    """
    url_schemes = ("http://", "https://")
    return " ".join([w for w in text.split() if not w.lower().startswith(url_schemes)])

  def remove_html_tags(self, text):
    """Return the text content of ``text`` as extracted by BeautifulSoup."""
    # Name the parser explicitly so results do not depend on installed parsers.
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

  def spelling_correction(self, text):
    """Apply ``self.autocorrect_spell`` to each whitespace-separated token."""
    return " ".join([self.autocorrect_spell(w) for w in text.split()])

  def preprocess_text(self, text,
                         lowercase=True,
                         uppercase=False,
                         remove_punct=True,
                         remove_stopwords=True,
                         remove_freqs=True,
                         stemming=True,
                         lemmatize=False,
                         remove_url=True,
                         remove_html_tags=True,
                         spelling_correct=True):
    """Run the enabled steps over ``text`` in a fixed order.

    Order: case change, URL removal, HTML removal, punctuation removal,
    stopword removal, frequent-word removal, spelling correction, then
    stemming or lemmatization.

    Args:
      text: input string.
      lowercase, uppercase: mutually exclusive case switches.
      stemming, lemmatize: mutually exclusive normalisation switches.
      remove_punct, remove_stopwords, remove_freqs, remove_url,
        remove_html_tags, spelling_correct: enable the matching step.

    Returns:
      The processed string, or ``None`` (after printing a message) when a
      mutually exclusive pair of switches is both enabled.
    """
    if lowercase and uppercase:
      print("Cannot do both lowercasing and uppercasing. Please specify one.")
      return None
    if stemming and lemmatize:
      print("Cannot do both stemming and lemmatization. Please specify one.")
      return None

    preprocessed_text = text
    if lowercase:
      preprocessed_text = self.lowercasing(preprocessed_text)
    if uppercase:
      preprocessed_text = self.uppercasing(preprocessed_text)
    if remove_url:
      preprocessed_text = self.remove_url(preprocessed_text)
    if remove_html_tags:
      preprocessed_text = self.remove_html_tags(preprocessed_text)
    if remove_punct:
      preprocessed_text = self.remove_puncts(preprocessed_text)
    if remove_stopwords:
      preprocessed_text = self.remove_stopwords(preprocessed_text)
    if remove_freqs:
      preprocessed_text = self.remove_freq_words(preprocessed_text)
    if spelling_correct:
      preprocessed_text = self.spelling_correction(preprocessed_text)
    if stemming:
      preprocessed_text = self.stemming(preprocessed_text)
    if lemmatize:
      preprocessed_text = self.lemmatizing(preprocessed_text)

    return preprocessed_text

# Build one preprocessor and reuse it for every demo call below.
preprocessor = TextPreprocessor()
# Sample input mixing an HTML tag, a URL, punctuation and hashtags.
text = """Hello AIO-2022, <p>we're AIVN. 
Follow us at: https://www.facebook.com/aivietnam.edu.vn
#aivn #aio2022"""
# 1) All default switches.
print(preprocessor.preprocess_text(text))
# 2) Keep punctuation and URLs, and skip spelling correction.
print(preprocessor.preprocess_text(text, remove_punct=False, remove_url=False, spelling_correct=False))
# 3) Conflicting switches: the method prints a message and returns None,
#    so this print() also shows "None".
print(preprocessor.preprocess_text(text, lowercase=True, uppercase=True))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


hello ain follow us ain
aio-2022, we'r aivn. follow us at: https://www.facebook.com/aivietnam.edu.vn #aivn #aio2022
Cannot do both lowercasing and uppercasing. Please specify one.
None


# Problem 02: Text Vectorization

In [None]:
!gdown --id 12kMc6YatHC3eLOca53VCgbLwlQPhWqyt

Downloading...
From: https://drive.google.com/uc?id=12kMc6YatHC3eLOca53VCgbLwlQPhWqyt
To: /content/abstract_sample.txt
100% 1.13k/1.13k [00:00<00:00, 1.81MB/s]


### Task 1: Tokenization

In [None]:
# Read the whole abstract and trim surrounding whitespace.
with open("abstract_sample.txt") as f:
    data = f.read().strip()

# Whitespace tokenization: punctuation stays attached to the words.
tokenized_data = data.split()
print(tokenized_data)

['We', 'present', 'YOLO,', 'a', 'new', 'approach', 'to', 'object', 'detection.', 'Prior', 'work', 'on', 'object', 'detection', 'repurposes', 'classifiers', 'to', 'per-', 'form', 'detection.', 'Instead,', 'we', 'frame', 'object', 'detection', 'as', 'a', 're-', 'gression', 'problem', 'to', 'spatially', 'separated', 'bounding', 'boxes', 'and', 'associated', 'class', 'probabilities.', 'A', 'single', 'neural', 'network', 'pre-', 'dicts', 'bounding', 'boxes', 'and', 'class', 'probabilities', 'directly', 'from', 'full', 'images', 'in', 'one', 'evaluation.', 'Since', 'the', 'whole', 'detection', 'pipeline', 'is', 'a', 'single', 'network,', 'it', 'can', 'be', 'optimized', 'end-to-end', 'directly', 'on', 'detection', 'performance.', 'Our', 'unified', 'architecture', 'is', 'extremely', 'fast.', 'Our', 'base', 'YOLO', 'model', 'processes', 'images', 'in', 'real-time', 'at', '45', 'frames', 'per', 'second.', 'A', 'smaller', 'version', 'of', 'the', 'network,', 'Fast', 'YOLO,', 'processes', 'an', 'as

### Task 2: Create dictionary

In [None]:
# Collect the distinct tokens in first-occurrence order.
dictionary = []
seen_words = set()  # O(1) membership test instead of scanning the list
for word in tokenized_data:
    if word not in seen_words:
        seen_words.add(word)
        dictionary.append(word)

print(dictionary)

['We', 'present', 'YOLO,', 'a', 'new', 'approach', 'to', 'object', 'detection.', 'Prior', 'work', 'on', 'detection', 'repurposes', 'classifiers', 'per-', 'form', 'Instead,', 'we', 'frame', 'as', 're-', 'gression', 'problem', 'spatially', 'separated', 'bounding', 'boxes', 'and', 'associated', 'class', 'probabilities.', 'A', 'single', 'neural', 'network', 'pre-', 'dicts', 'probabilities', 'directly', 'from', 'full', 'images', 'in', 'one', 'evaluation.', 'Since', 'the', 'whole', 'pipeline', 'is', 'network,', 'it', 'can', 'be', 'optimized', 'end-to-end', 'performance.', 'Our', 'unified', 'architecture', 'extremely', 'fast.', 'base', 'YOLO', 'model', 'processes', 'real-time', 'at', '45', 'frames', 'per', 'second.', 'smaller', 'version', 'of', 'Fast', 'an', 'astounding', '155', 'second', 'while', 'still', 'achieving', 'double', 'mAP', 'other', 'detec-', 'tors.', 'Compared', 'state-of-the-art', 'systems,', 'makes', 'more', 'localization', 'errors', 'but', 'less', 'likely', 'predict', 'false',

### Task 3.1: Count vectorizer

In [None]:
def count_vectorizer(text, dictionary):
  """Count how often each dictionary word occurs in ``text``.

  Args:
    text: input string, tokenized by splitting on whitespace.
    dictionary: sequence of vocabulary words.

  Returns:
    A list of ints with ``len(dictionary)`` entries; entry ``i`` is the
    number of tokens equal to ``dictionary[i]``. Tokens that are not in the
    dictionary are ignored.
  """
  # Map each word to the index of its first occurrence (what list.index gives)
  # so lookups do not rescan the dictionary for every token.
  index_of = {}
  for position, vocab_word in enumerate(dictionary):
    index_of.setdefault(vocab_word, position)

  vector = [0] * len(dictionary)
  for token in text.split():
    position = index_of.get(token)
    if position is not None:
      vector[position] += 1

  return vector

print(count_vectorizer("This is YOLO", dictionary))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### Task 3.2: One-hot encoding

In [None]:
def one_hot_encoding(text, dictionary):
  """Encode each token of ``text`` as a one-hot vector over ``dictionary``.

  Args:
    text: input string, tokenized by splitting on whitespace.
    dictionary: sequence of vocabulary words.

  Returns:
    A list with one row per token. Each row has ``len(dictionary)`` entries
    and a single 1 at the token's dictionary index; tokens that are not in
    the dictionary produce an all-zero row.
  """
  # Map each word to the index of its first occurrence (what list.index gives).
  index_of = {}
  for position, vocab_word in enumerate(dictionary):
    index_of.setdefault(vocab_word, position)

  vector = []
  for token in text.split():
    vector_one_hot = [0] * len(dictionary)
    # Look up the current token, not a name leaked from the enclosing scope.
    position = index_of.get(token)
    if position is not None:
      vector_one_hot[position] = 1
    vector.append(vector_one_hot)

  return vector

print(one_hot_encoding("This is YOLO", dictionary))

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## TextVectorizer

In [None]:
class TextVectorizer:
  """Whitespace tokenizer plus count and one-hot encoders over a word list."""

  def __init__(self, dictionary=None):
    """Store an optional pre-built vocabulary (a list of words)."""
    self.dictionary = dictionary

  def tokenize(self, text):
    """Split ``text`` on runs of whitespace.

    Splitting on whitespace runs (rather than on a single space) avoids
    empty-string tokens when the text has leading, trailing or repeated spaces.
    """
    return text.split()

  def create_dictionary(self, corpus):
    """Build the vocabulary from an iterable of documents.

    Words are kept in first-occurrence order. The result is stored on
    ``self.dictionary`` and also returned.
    """
    dictionary = []
    seen = set()  # O(1) membership test instead of scanning the list
    for doc in corpus:
      for token in self.tokenize(doc):
        if token not in seen:
          seen.add(token)
          dictionary.append(token)

    self.dictionary = dictionary
    return dictionary

  def _index_lookup(self):
    """Map each vocabulary word to the index of its first occurrence."""
    index_of = {}
    for position, vocab_word in enumerate(self.dictionary):
      index_of.setdefault(vocab_word, position)
    return index_of

  def one_hot_encoding(self, text):
    """Return one one-hot row per token of ``text``.

    Tokens that are not in the vocabulary yield an all-zero row.
    """
    index_of = self._index_lookup()
    size = len(self.dictionary)
    vector = []
    for token in self.tokenize(text):
      sub_vector = [0] * size
      position = index_of.get(token)
      if position is not None:
        sub_vector[position] = 1
      vector.append(sub_vector)

    return vector

  def count_vectorizer(self, text):
    """Return per-vocabulary-word occurrence counts for ``text``.

    Tokens that are not in the vocabulary are ignored.
    """
    index_of = self._index_lookup()
    vector = [0] * len(self.dictionary)
    for token in self.tokenize(text):
      position = index_of.get(token)
      if position is not None:
        vector[position] += 1

    return vector

# Part 1: build the vocabulary from a small corpus, then count-vectorize a query.
vectorizer = TextVectorizer()
corpus = [
  "hello my name is robo",
  "this is the world of happiness",
  "welcome to machine learning",
  "please be aware of errors",
  "deep learning is fun",
  "welcome to aivn",
  "hope you all the best, aio2022"
]
vectorizer.create_dictionary(corpus)
text = "welcome aio2022"
print(vectorizer.count_vectorizer(text))

# Part 2: same query against a hand-written vocabulary that lacks "aio2022",
# so that token contributes nothing to the counts and gets an all-zero one-hot row.
dictionary = ['hello', 'my', 'name', 'robo', 'world', 'welcome', 'happiness']
vectorizer = TextVectorizer(dictionary)
print(vectorizer.count_vectorizer(text))
print(vectorizer.one_hot_encoding(text))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 1, 0]
[[0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0]]


# Problem 03: Autocomplete

In [None]:
!gdown --id 16IiXNoRJxsRBfFIr1wacrULbQHGi_lEa
!unzip txt.zip

Downloading...
From: https://drive.google.com/uc?id=16IiXNoRJxsRBfFIr1wacrULbQHGi_lEa
To: /content/txt.zip
100% 276k/276k [00:00<00:00, 80.2MB/s]
Archive:  txt.zip
   creating: txt/
 extracting: txt/B01___24_Matthew_____ENGESVN1DA_verse_34.txt  
 extracting: txt/B01___25_Matthew_____ENGESVN1DA_verse_41.txt  
 extracting: txt/B01___25_Matthew_____ENGESVN1DA_verse_6.txt  
 extracting: txt/B01___25_Matthew_____ENGESVN1DA_verse_1.txt  
 extracting: txt/B01___13_Matthew_____ENGESVN1DA_verse_48.txt  
 extracting: txt/B01___07_Matthew_____ENGESVN1DA_verse_1.txt  
 extracting: txt/B01___05_Matthew_____ENGESVN1DA_verse_44.txt  
 extracting: txt/B01___25_Matthew_____ENGESVN1DA_verse_10.txt  
 extracting: txt/B01___07_Matthew_____ENGESVN1DA_verse_18.txt  
 extracting: txt/B01___25_Matthew_____ENGESVN1DA_verse_22.txt  
 extracting: txt/B01___24_Matthew_____ENGESVN1DA_verse_46.txt  
 extracting: txt/B01___25_Matthew_____ENGESVN1DA_verse_24.txt  
 extracting: txt/B01___25_Matthew_____ENGESVN1DA_vers

In [None]:
import os
import string

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

def remove_stopwords(text):
  """Remove English stopwords from ``text``.

  Tokens are obtained by splitting on whitespace and are compared exactly
  (case-sensitively) against ``stopwords.words("english")``.

  Returns the surviving tokens joined by a single space.
  """
  # Build the lookup set once, outside the comprehension, so the stopword
  # list is not reloaded for every token.
  english_stopwords = set(stopwords.words("english"))
  return " ".join([w for w in text.split() if w not in english_stopwords])

def remove_punctuation(text):
  """Return ``text`` with every character in ``string.punctuation`` deleted."""
  deletion_table = str.maketrans("", "", string.punctuation)
  return text.translate(deletion_table)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def create_dictionary(corpus_path='./txt'):
  """Build a list of distinct words from every file in ``corpus_path``.

  Each file is read as UTF-8 text, stripped, passed through
  ``remove_punctuation`` and ``remove_stopwords``, and split on whitespace.

  Args:
    corpus_path: directory whose entries are all opened as text files.

  Returns:
    The distinct words in first-occurrence order. Files are visited in the
    order returned by ``os.listdir``, which is arbitrary.
  """
  dictionary = []
  seen_words = set()  # O(1) membership test instead of scanning the list
  for filename in os.listdir(corpus_path):
    file_path = os.path.join(corpus_path, filename)
    # The context manager closes each file as soon as it has been read.
    with open(file_path, encoding="utf-8") as f:
      text = f.read()
    text = text.strip()
    text = remove_punctuation(text)
    text = remove_stopwords(text)

    for word in text.split():
      if word not in seen_words:
        seen_words.add(word)
        dictionary.append(word)

  return dictionary
      
# Build the dictionary once and reuse it for both prints, so the corpus is read a single time.
corpus_dictionary = create_dictionary()
print("dictionary size: ", len(corpus_dictionary))
print(corpus_dictionary)

dictionary size:  2159
['Then', 'high', 'priest', 'tore', 'robes', 'said', 'He', 'uttered', 'blasphemy', 'What', 'witnesses', 'need', 'You', 'heard', 'And', 'Capernaum', 'exalted', 'heaven', 'brought', 'Hades', 'For', 'mighty', 'works', 'done', 'Sodom', 'would', 'remained', 'day', 'deportation', 'Babylon', 'Jechoniah', 'father', 'Shealtiel', 'Zerubbabel', 'The', 'one', 'receives', 'prophet', 'receive', 'reward', 'righteous', 'person', 'It', 'also', '‘Whoever', 'divorces', 'wife', 'let', 'give', 'certificate', 'divorce', 'Jesus', 'I', 'say', 'seven', 'times', 'seventy', 'Matthew', '26', 'answered', 'Every', 'plant', 'heavenly', 'Father', 'planted', 'rooted', 'But', 'tenants', 'saw', 'son', '‘This', 'heir', 'Come', 'us', 'kill', 'inheritance', 'When', 'entered', 'house', 'blind', 'men', 'came', 'Do', 'believe', 'able', 'They', 'Yes', 'Lord', 'Now', 'John', 'prison', 'deeds', 'Christ', 'sent', 'word', 'disciples', 'Thus', 'witness', 'sons', 'murdered', 'prophets', 'seized', 'led', 'Caiaph

In [None]:
def problem03(text, dictionary=None):
  """Print a numbered list of dictionary words that start with ``text``.

  Matching is case-insensitive on both sides; words are printed as stored.

  Args:
    text: prefix to complete.
    dictionary: optional iterable of candidate words. When omitted, the
      dictionary is built by calling ``create_dictionary()``.
  """
  if dictionary is None:
    dictionary = create_dictionary()
  prefix = text.lower()  # lowercase the prefix once, not once per word
  print("Suggest words:")
  matches = (w for w in dictionary if w.lower().startswith(prefix))
  for rank, w in enumerate(matches, start=1):
    print(f"{rank}. {w}")

In [None]:
text = "hel"
problem03(text)

Suggest words:
1. held
2. helpless
3. hell
4. help
