diff --git a/.gitignore b/.gitignore index 7dd29ef86..2c6667640 100644 --- a/.gitignore +++ b/.gitignore @@ -82,4 +82,4 @@ docs/_build/doctrees/api/ docs/_build/html/ -docs/_build/doctrees/ +docs/_build/doctrees/ \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index d344703c4..7bc977447 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,10 +15,9 @@ script: coverage run --source=pythainlp setup.py test after_success: coveralls deploy: provider: pypi - distributions: sdist bdist_wheel + distributions: bdist_wheel user: wannaphong password: - secure: zX35+8niw5W9H8XbFwacrDAhqyIibdUdC/cARnHlmxLN/2H9IynK0NW04UZwkBlrwrIZrU/g+cqYXFQXu6jE1ozlBKBxUd3xG8d1kixuntI0j9e+erPTs8Ju/KazUZtlknJPvnDMP+/1Dq+RMnMCP3RRlBrH6lvG70OgZ1aBpgx8FxRfs0xHfBIZvo5CVtR/QlDzhDJM1cgEyWkSgnlAhPxpv8qIQbh4/Rw89jXIZqv0bGCVJorrrcTA1oCzkr/4E4u/WZaARnvPjUr2a9U1w7C2IysDHiBfqQWlovdMmpoSLFE56YlG3smbmXfldWjmiMRQoWL+Ifu+smisvOLmR0ja78UMrrhHWP4mdzIeBVVRnT6eHUv0ChmLT2uCkOLE0newhtEJIYToot2TSoLFavXXIQB1fIHt6e74KRTV6WGnm0nFfHuGP+b5SgSPQFgqx8tBpn0rBOeqZ1y3pRISc/drF0F4reWMnlqoQfZZFmLmU1UmDZbvWNvXPu6MWyyuZ1F6fE9jyb3mG+kDuJf1PZ4ejC/sdIvpLlwUGLFGzRMa2TtxXqGq5CWsywPxo8Sx+bpMPCOImuW60PB9K/xKgfLhAtb7gZwndzUGqDbtSJCd5PmTkfEH8fawv/XnydvsssYUpipBCmFDZlNREyAkgOcLlL099Y5fAO8l2gOLyKs= + secure: Tj3VA05qopp0mkzWu6EFTlvijAoisd0BN/gD2c/vaaDCUy6fTXBkYk+dTkjbmYkEBl/WrsrW1T/QxCt2uc6bv7QTz+qL243Edv4FFQbBKvMSNlUO+hh1jI9zv3/QzwOaNHXOsI4JGeUaN5cULfxBjsBEFN+v6E0mkgBwJ0Qdb0/yuMybLWZ9dJI8iUKiaWNIr+NQoa9a+Sxw6Ltl/mdCKPppgOYPpVMCsDDdLqZdjkgXmzsjH9+Nfe6R+mYbdmeigy3ePNsIKbPkzZrY+E/I0lPZOVUgrs6gvZwlD3gESJgTROrUH6E2lBP9yYvFUE3KB0O+rdT5GyFq3MH1uD2ocrPCTQku6577wK31FzGoex6vtT4y2b39fLbhRmZDOJW8IFO7MLybazuRsNhaXn9hQU4HBRM2GQZc41bLkiEhsUX9/b2ujcn4PJKDZy91LnBw/93bgZJ7KweDzKywmcZSNeuBsGWgXdPqYiizzcf8DdvJAYytydhf8RxqdemTiS7GE7XBoXhj1/9Vfrt3lZXZbfYpTjNZeyxu7FrUJpm/I23wCw46qaRWzKXv2sRRUleNqQ1jIKEVupIa9sruHvG7DZecErhO9rMkGdsf4CIjolZ0A2BE+eAPEEY6/H1WFUWHxzxuELbUJwxnl1By677hBkLJaVs1YMGc2enGWzOnUYI= on: - tags: true - repo: pythainlp/pythainlp \ 
No newline at end of file + tags: true \ No newline at end of file diff --git a/README-pypi.md b/README-pypi.md index 51ad0707b..b39e99b59 100644 --- a/README-pypi.md +++ b/README-pypi.md @@ -8,20 +8,15 @@ PyThaiNLP includes Thai word tokenizers, transliterators, soundex converters, pa 📫 follow us on Facebook [PyThaiNLP](https://www.facebook.com/pythainlp/) -## What's new in 2.0 ? +## What's new in 2.1 ? -- Terminate Python 2 support. Remove all Python 2 compatibility code. - Improved `word_tokenize` ("newmm" and "mm" engine), a `custom_dict` dictionary can be provided -- Improved `pos_tag` Part-Of-Speech tagging -- New `NorvigSpellChecker` spell checker class, which can be initialized with custom dictionary. -- New `thai2fit` (replacing `thai2vec`, upgrade ULMFiT-related code to fastai 1.0) -- Updated ThaiNER to 1.0 - - You may need to [update your existing ThaiNER models from PyThaiNLP 1.7](https://github.com/PyThaiNLP/pythainlp/wiki/Upgrade-ThaiNER-from-PyThaiNLP-1.7-to-PyThaiNLP-2.0) -- Remove old, obsolated, deprecated, duplicated, and experimental code. - - Sentiment analysis is no longer part of the library, but rather [a text classification example](https://github.com/PyThaiNLP/pythainlp/blob/dev/notebooks/sentiment_analysis.ipynb). +- Add AttaCut to be options for `word_tokenize` engine. +- New Thai2rom (PyTorch) +- New Command Line +- Add word tokenization benchmark to PyThaiNLP - See more examples in [Get Started notebook](https://github.com/PyThaiNLP/pythainlp/blob/dev/notebooks/pythainlp-get-started.ipynb) -- [Full change log](https://github.com/PyThaiNLP/pythainlp/issues/118) -- [Upgrading from 1.7](https://thainlp.org/pythainlp/docs/2.0/notes/pythainlp-1_7-2_0.html) +- [Full change log](https://github.com/PyThaiNLP/pythainlp/issues/181) ## Install @@ -40,6 +35,7 @@ pip install pythainlp[extra1,extra2,...] 
where extras can be - `artagger` (to support artagger part-of-speech tagger)* +- `attacut` - Wrapper for AttaCut (https://github.com/PyThaiNLP/attacut) - `deepcut` (to support deepcut machine-learnt tokenizer) - `icu` (for ICU support in transliteration and tokenization) - `ipa` (for International Phonetic Alphabet support in transliteration) @@ -54,8 +50,15 @@ Install it with pip, for example: `pip install marisa_trie‑0.7.5‑cp36‑cp36 ## Links -- User guide: [English](https://github.com/PyThaiNLP/pythainlp/blob/dev/notebooks/pythainlp-get-started.ipynb), [ภาษาไทย](https://colab.research.google.com/drive/1rEkB2Dcr1UAKPqz4bCghZV7pXx2qxf89) -- Docs: https://thainlp.org/pythainlp/docs/2.0/ +- User guide: [English](https://github.com/PyThaiNLP/pythainlp/blob/dev/notebooks/pythainlp-get-started.ipynb) +- Docs: https://thainlp.org/pythainlp/docs/2.1/ - GitHub: https://github.com/PyThaiNLP/pythainlp - Issues: https://github.com/PyThaiNLP/pythainlp/issues - Facebook: [PyThaiNLP](https://www.facebook.com/pythainlp/) + + +Made with ❤️ + +We build Thai NLP. + +PyThaiNLP Team. \ No newline at end of file diff --git a/README.md b/README.md index 9a814a5ea..fab714986 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,7 @@ Made with ❤️ We build Thai NLP. -PyThaiNLP team. +PyThaiNLP Team. 
# ภาษาไทย diff --git a/appveyor.docs.yml b/appveyor.docs.yml new file mode 100644 index 000000000..12f512c06 --- /dev/null +++ b/appveyor.docs.yml @@ -0,0 +1,62 @@ +image: ubuntu1604 + +branches: + only: + - /2.*/ + - dev + +skip_commits: + message: /(skip ci docs)/ # Skip a new build if message contains '(skip ci docs)' + +install: + - sudo add-apt-repository ppa:jonathonf/python-3.6 -y + - sudo apt-get update + - sudo apt install -y python3.6 + - sudo apt install -y python3.6-dev + - sudo apt install -y python3.6-venv + - wget https://bootstrap.pypa.io/get-pip.py + - sudo python3.6 get-pip.py + - sudo ln -s /usr/bin/python3.6 /usr/local/bin/python + - sudo apt-get install -y pandoc libicu-dev + - python -V + - python3 -V + - pip -V + - sudo pip install -r requirements.txt + - export LD_LIBRARY_PATH=/usr/local/lib + - sudo pip install torch==1.2.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + - sudo pip install sphinx sphinx-rtd-theme typing artagger deepcut epitran keras numpy pyicu sklearn-crfsuite tensorflow ssg emoji pandas + - sudo pip install --upgrade gensim smart_open boto + +# configuration for deploy mode, commit message with /(build and deloy docs)/ +# 1. build documents and upload HTML files to Appveyor's storage +# 2. upload to thainlp.org/pythainlp/docs/ + +only_commits: + message: /(build and deploy docs)/ + +build_script: + - cd ./docs + - export CURRENT_BRANCH=$APPVEYOR_REPO_BRANCH + - export RELEASE=$(git describe --tags --always) + - export RELEASE=$(echo $RELEASE | cut -d'-' -f1) + - export TODAY=$(date +'%Y-%m-%d') + - make html + - echo "Done building HTML files for the branch -- $APPVEYOR_REPO_BRANCH" + - echo "Start cleaning the directory /docs/$APPVEYOR_REPO_BRANCH" + - sudo bash ./clean_directory.sh $FTP_USER $FTP_PASSWORD $FTP_HOST $APPVEYOR_REPO_BRANCH + - echo "Start Uploading files to thainlp.org/pythainlp/docs/$APPVEYOR_REPO_BRANCH" + - cd ./_build/html + - echo "cd to ./build/html" + - find . 
-type f -name "*" -print -exec curl --ftp-create-dir --ipv4 -T {} ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_HOST}/public_html/pythainlp/docs/$APPVEYOR_REPO_BRANCH/{} \; + - echo "Done uploading" + - echo "Done uploading files to -- thainlp.org/pythainlp/docs/$APPVEYOR_REPO_BRANCH" + +artifacts: + - path: ./docs/_build/html + name: document + +after_build: + - echo "Done build and deploy" + - appveyor exit + +test: off diff --git a/bin/word-tokenization-benchmark b/bin/word-tokenization-benchmark index 0193926d7..a692d3db2 100644 --- a/bin/word-tokenization-benchmark +++ b/bin/word-tokenization-benchmark @@ -1,121 +1,115 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +import argparse import json import os -import argparse -import yaml -from pythainlp.benchmarks import word_tokenisation +import yaml +from pythainlp.benchmarks import word_tokenization parser = argparse.ArgumentParser( description="Script for benchmarking tokenizaiton results" ) parser.add_argument( - "--input", + "--input-file", action="store", - help="path to file that you want to compare against the test file" + help="Path to input file to compare against the test file", ) parser.add_argument( "--test-file", action="store", - help="path to test file" + help="Path to test file i.e. 
ground truth", ) parser.add_argument( "--save-details", default=False, - action='store_true', - help="specify whether to save the details of comparisons" + action="store_true", + help="Save comparison details to files (eval-XXX.json and eval-details-XXX.json)", ) args = parser.parse_args() + def _read_file(path): with open(path, "r", encoding="utf-8") as f: lines = map(lambda r: r.strip(), f.readlines()) return list(lines) -print(args.input) -actual = _read_file(args.input) +print(args.input_file) +actual = _read_file(args.input_file) expected = _read_file(args.test_file) -assert len(actual) == len(expected), \ - 'Input and test files do not have the same number of samples' -print('Benchmarking %s against %s with %d samples in total' % ( - args.input, args.test_file, len(actual) -)) - -df_raw = word_tokenisation.benchmark(expected, actual) - -df_res = df_raw\ - .describe() -df_res = df_res[[ - 'char_level:tp', - 'char_level:tn', - 'char_level:fp', - 'char_level:fn', - 'char_level:precision', - 'char_level:recall', - 'char_level:f1', - 'word_level:precision', - 'word_level:recall', - 'word_level:f1', -]] +assert len(actual) == len( + expected +), "Input and test files do not have the same number of samples" +print( + "Benchmarking %s against %s with %d samples in total" + % (args.input_file, args.test_file, len(actual)) +) + +df_raw = word_tokenization.benchmark(expected, actual) + +df_res = df_raw.describe() +df_res = df_res[ + [ + "char_level:tp", + "char_level:tn", + "char_level:fp", + "char_level:fn", + "char_level:precision", + "char_level:recall", + "char_level:f1", + "word_level:precision", + "word_level:recall", + "word_level:f1", + ] +] df_res = df_res.T.reset_index(0) -df_res['mean±std'] = df_res.apply( - lambda r: '%2.2f±%2.2f' % (r['mean'], r['std']), - axis=1 +df_res["mean±std"] = df_res.apply( + lambda r: "%2.2f±%2.2f" % (r["mean"], r["std"]), axis=1 ) -df_res['metric'] = df_res['index'] +df_res["metric"] = df_res["index"] print("============== 
Benchmark Result ==============") -print(df_res[['metric', 'mean±std', 'min', 'max']].to_string(index=False)) - +print(df_res[["metric", "mean±std", "min", "max"]].to_string(index=False)) if args.save_details: data = {} - for r in df_res.to_dict('records'): - metric = r['index'] - del r['index'] + for r in df_res.to_dict("records"): + metric = r["index"] + del r["index"] data[metric] = r - dir_name = os.path.dirname(args.input) - file_name = args.input.split("/")[-1].split(".")[0] + dir_name = os.path.dirname(args.input_file) + file_name = args.input_file.split("/")[-1].split(".")[0] res_path = "%s/eval-%s.yml" % (dir_name, file_name) print("Evaluation result is saved to %s" % res_path) - with open(res_path, 'w') as outfile: + with open(res_path, "w", encoding="utf-8") as outfile: yaml.dump(data, outfile, default_flow_style=False) res_path = "%s/eval-details-%s.json" % (dir_name, file_name) print("Details of comparisons is saved to %s" % res_path) - with open(res_path, "w") as f: + with open(res_path, "w", encoding="utf-8") as f: samples = [] for i, r in enumerate(df_raw.to_dict("records")): expected, actual = r["expected"], r["actual"] del r["expected"] del r["actual"] - samples.append(dict( - metrics=r, - expected=expected, - actual=actual, - id=i - )) - - details = dict( - metrics=data, - samples=samples - ) + samples.append(dict(metrics=r, expected=expected, actual=actual, id=i)) + + details = dict(metrics=data, samples=samples) json.dump(details, f, ensure_ascii=False) diff --git a/bld.bat b/bld.bat deleted file mode 100644 index f1f6e67a7..000000000 --- a/bld.bat +++ /dev/null @@ -1,2 +0,0 @@ -cd %RECIPE_DIR%\.. -%PYTHON% setup.py install --single-version-externally-managed --record=record.txt diff --git a/build.sh b/build.sh deleted file mode 100644 index 838a79040..000000000 --- a/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -cd "$RECIPE_DIR"/.. 
|| exit -$PYTHON setup.py install --single-version-externally-managed --record=record.txt diff --git a/buildall.sh b/buildall.sh deleted file mode 100755 index 104591b14..000000000 --- a/buildall.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -# change the package name to the existing PyPi package you would like to build and adjust the Python versions -pkg='pythainlp' -array=( 3.5 3.6 3.7 ) - -echo "Building conda package ..." -cd ~ -conda skeleton pypi $pkg -cd $pkg -wget https://raw.githubusercontent.com/Anaconda-Platform/anaconda-client/master/conda.recipe/build.sh -wget https://raw.githubusercontent.com/Anaconda-Platform/anaconda-client/master/conda.recipe/bld.bat -cd ~ - -# building conda packages -for i in "${array[@]}" -do - conda-build --python $i $pkg -done - -# convert package to other platforms -cd ~ -platforms=( osx-64 linux-32 linux-64 win-32 win-64 ) -find $HOME/conda-bld/linux-64/ -name *.tar.bz2 | while read file -do - echo $file - #conda convert --platform all $file -o $HOME/conda-bld/ - for platform in "${platforms[@]}" - do - conda convert --platform $platform $file -o $HOME/conda-bld/ - done - -done - -# upload packages to conda -find $HOME/conda-bld/ -name *.tar.bz2 | while read file -do - echo $file - anaconda upload $file -done - -echo "Building conda package done!" diff --git a/docs/api/benchmarks.rst b/docs/api/benchmarks.rst index e5167ee96..26ee1a1b5 100644 --- a/docs/api/benchmarks.rst +++ b/docs/api/benchmarks.rst @@ -19,6 +19,6 @@ Quality Qualitative evaluation of word tokenization. -.. autofunction:: pythainlp.benchmarks.word_tokenisation.compute_stats -.. autofunction:: pythainlp.benchmarks.word_tokenisation.benchmark -.. autofunction:: pythainlp.benchmarks.word_tokenisation.preprocessing +.. autofunction:: pythainlp.benchmarks.word_tokenization.compute_stats +.. autofunction:: pythainlp.benchmarks.word_tokenization.benchmark +.. 
autofunction:: pythainlp.benchmarks.word_tokenization.preprocessing diff --git a/docs/api/ulmfit.rst b/docs/api/ulmfit.rst index 527336c64..9b7bdba28 100644 --- a/docs/api/ulmfit.rst +++ b/docs/api/ulmfit.rst @@ -6,13 +6,21 @@ The :class:`ulmfit.utils` is utils for ULMFit model. Modules ------- - -.. autofunction:: replace_rep_after +.. autoclass:: ThaiTokenizer +.. autofunction:: document_vector +.. autofunction:: process_thai +.. autofunction:: fix_html +.. autofunction:: spec_add_spaces +.. autofunction:: rm_useless_spaces .. autofunction:: rm_useless_newlines .. autofunction:: rm_brackets +.. autofunction:: replace_rep_nonum .. autofunction:: ungroup_emoji .. autofunction:: lowercase_all +.. autofunction:: replace_wrep_post +.. autofunction:: replace_wrep_post_nonum +.. autofunction:: replace_rep_after +.. autofunction:: remove_space .. autofunction:: merge_wgts -.. autofunction:: document_vector -.. autoclass:: ThaiTokenizer - :members: tokenizer + +:members: tokenizer diff --git a/docs/clean_directory.sh b/docs/clean_directory.sh new file mode 100644 index 000000000..126ee4774 --- /dev/null +++ b/docs/clean_directory.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Delete all files and folders in the directory: /pythainlp/docs/ + +# $1 : FTP_USER +# $2 : FTP_PASSWORD +# $3 : FTP_HOST +# $4 : Brnach name + +FTP_USER=$1 +FTP_PASSWORD=$2 +FTP_HOST=$3 +BRANCH_NAME=$4 + +remove_all_files() +{ + # DIRECTORY=$1 + echo "delete files in: $1" + for f in `curl --list-only --ftp-create-dirs --ipv4 ftp://$FTP_USER:$FTP_PASSWORD@$FTP_HOST/$1/`; do + if [[ -d "$f" ]] || [[ "$f" = _* ]] || [[ "$f" = .doctree ]] || [[ "$f" != *"."* ]]; then + echo "--- deleting files in folder: $1/$f"; + remove_all_files $1/$f + else + echo "delete a file: $f" + curl --ipv4 ftp://$FTP_USER:$FTP_PASSWORD@$FTP_HOST -Q "DELE $1/$f" + fi + done +} + +remove_empty_folders() +{ + + echo "delete empty folders in: $1" + for f in `curl --list-only --ftp-create-dirs --ipv4 
ftp://$FTP_USER:$FTP_PASSWORD@$FTP_HOST/$1/`; do + if [[ -d "$f" ]] || [[ "$f" = _* ]] || [[ "$f" = fonts ]] || [[ "$f" = pythainlp ]] || [[ "$f" = .doctree ]] || [[ "$f" != *"."* ]]; then + echo "--- deleting folders in: $1/$f"; + remove_empty_folders $1/$f + curl --ipv4 ftp://$FTP_USER:$FTP_PASSWORD@$FTP_HOST -Q "RMD $1/$f" + else + echo "delete a folder: $f" + curl --ipv4 ftp://$FTP_USER:$FTP_PASSWORD@$FTP_HOST -Q "RMD $1/$f" + fi + done +} + +echo "Start removing all files within 'public_html/pythainlp/docs/$BRANCH_NAME/'"; + +remove_all_files public_html/pythainlp/docs/$BRANCH_NAME; + +echo "Start removing all empty folders within 'public_html/pythainlp/docs/$BRANCH_NAME/'"; + +remove_empty_folders public_html/pythainlp/docs/$BRANCH_NAME; + +echo "Done"; \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 9a9e47343..a43b593a0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,6 +11,7 @@ # import os import sys +import traceback sys.path.insert(0, os.path.abspath('..')) from datetime import datetime @@ -23,10 +24,24 @@ curyear = datetime.today().year copyright = u'2017-%s, %s (Apache Software License 2.0)' % (curyear, project) +# -- Get version information and date from Git ---------------------------- + +try: + from subprocess import check_output, STDOUT + current_branch = os.environ["CURRENT_BRANCH"] if "CURRENT_BRANCH" in os.environ else check_output(['git', 'symbolic-ref', 'HEAD'], shell=False, stderr=STDOUT).decode().strip().split('/')[-1] + release = os.environ["RELEASE"] if "RELEASE" in os.environ else check_output(['git', 'describe', '--tags', '--always'], shell=False, stderr=STDOUT).decode().strip().split('-')[0] + today = os.environ["TODAY"] if "TODAY" in os.environ else check_output(['git', 'show', '-s', '--format=%ad', '--date=short'], shell=False, stderr=STDOUT).decode().strip() +except Exception as e: + traceback.print_exc() + release = '' + today = '' + current_branch = '' + # The short X.Y version -version = '2.0' +version 
= '{} ({})
Published date: {}'.format(current_branch, release, today) + # The full version, including alpha/beta/rc tags -release = '2.0.3' +release = release # -- General configuration --------------------------------------------------- @@ -88,7 +103,9 @@ # further. For a list of options available for each theme, see the # documentation. # -# html_theme_options = {} +html_theme_options = { + 'display_version': True, +} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/images/evaluation.png b/docs/images/evaluation.png new file mode 100644 index 000000000..3ac1cd13e Binary files /dev/null and b/docs/images/evaluation.png differ diff --git a/docs/notes/command_line.rst b/docs/notes/command_line.rst index 90211ccc6..329ba6c78 100644 --- a/docs/notes/command_line.rst +++ b/docs/notes/command_line.rst @@ -7,27 +7,45 @@ Run Command Line and type the commands::: pythainlp -**Word segment**:: +**Word tokenization**:: - pythainlp -t TEXT -seg + pythainlp tokenization word --text TEXT *Example*:: - $ pythainlp -t แมวกินปลา -seg - แมว|กิน|ปลา + $ pythainlp tokenization word --text "ผมร<0e31>กประเทศไทย? สามารถ" --engine newmm + ผม|รัก|ประเทศไทย|?| |สามารถ -**Postag**:: - pythainlp -t TEXT -pos +**Syllable tokenization**:: + + pythainlp tokenization syllable --text TEXT + +*Example*:: + + $ pythainlp tokenization syllable --text "ผมร<0e31>กประเทศไทย? สามารถ" + ผม~รัก~ประ~เทศ~ไทย~? 
~สา~มารถ + +**Part-Of-Speech tagging**:: + + pythainlp tagging pos --text TEXT + +*Example*:: + + $ pythainlp tagging pos --text "ผม|ไม่|กิน|เผ็ด" + +**Soundex**:: + + pythainlp soundex --text TEXT *Example*:: - $ pythainlp -t แมวกินปลา -pos - แมว/NCMN กิน/VACT ปลา/NCMN + $ pythainlp soundex --text "บ<0e39>รณการ" --engine lk82 + บE419 **Mange corpus**:: - pythainlp -c + pythainlp corpus **Help**:: diff --git a/notebooks/sentiment_analysis.ipynb b/notebooks/sentiment_analysis.ipynb index c1af928cb..74c9d4f9e 100644 --- a/notebooks/sentiment_analysis.ipynb +++ b/notebooks/sentiment_analysis.ipynb @@ -1,1805 +1,1838 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8AF08ssF6z5K" + }, + "source": [ + "# Sentiment Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "o-tHtMyc6z5L" + }, + "source": [ + "This notebook details the steps taken to create a sentiment analyzer using data from [WISESIGHT Sentiment Analysis](https://www.kaggle.com/c/wisesight-sentiment/) competition. Competition metric is overall accuracy across `neg`ative, `pos`itive, `neu`tral and `q`uestion classes. 
We give examples using logistic regression and ULMFit.\n", + "\n", + "The results for logistic regression, FastText, ULMFit, ULMFit with semi-supervised data are as follows:\n", + "\n", + "| Model | Public Accuracy | Private Accuracy |\n", + "|---------------------|-----------------|------------------|\n", + "| Logistic Regression | 0.72781 | 0.7499 |\n", + "| FastText | 0.63144 | 0.6131 |\n", + "| ULMFit | 0.71259 | 0.74194 |\n", + "| ULMFit Semi-supervised | 0.73119 | 0.75859 |\n", + "| ULMFit Semi-supervised Repeated One Time | **0.73372** | **0.75968** |\n", + "\n", + "For more information about the competition, see [1st Place Solution](https://www.kaggle.com/c/wisesight-sentiment/discussion/83564)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { "colab": { - "name": "sentiment_analysis.ipynb", - "version": "0.3.2", - "provenance": [] + "base_uri": "https://localhost:8080/", + "height": 1000 }, - "accelerator": "GPU" + "colab_type": "code", + "id": "bNjkuQK46z5M", + "outputId": "e04e1073-d8ca-4bd2-ac1c-2240826a75dc" + }, + "outputs": [], + "source": [ + "# #uncomment if you are running from google colab\n", + "# !pip install sklearn_crfsuite\n", + "# !pip install emoji\n", + "# !pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", + "# !pip install fastai\n", + "# !wget https://github.com/PyThaiNLP/wisesight-sentiment/archive/master.zip\n", + "# !unzip master.zip\n", + "# !mkdir wisesight_data; ls\n", + "# !cd wisesight-sentiment-master/kaggle-competition; ls" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "8AF08ssF6z5K", - "colab_type": "text" - }, - "source": [ - "# Sentiment Analysis" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "o-tHtMyc6z5L", - "colab_type": "text" - }, - "source": [ - "This notebook details the steps taken to create a sentiment analyzer using data from [WISESIGHT Sentiment Analysis](https://www.kaggle.com/c/wisesight-sentiment/) competition. 
Competition metric is overall accuracy across `neg`ative, `pos`itive, `neu`tral and `q`uestion classes. We give examples using logistic regression and ULMFit.\n", - "\n", - "The results for logistic regression, FastText, ULMFit, ULMFit with semi-supervised data are as follows:\n", - "\n", - "| Model | Public Accuracy | Private Accuracy |\n", - "|---------------------|-----------------|------------------|\n", - "| Logistic Regression | 0.72781 | 0.7499 |\n", - "| FastText | 0.63144 | 0.6131 |\n", - "| ULMFit | 0.71259 | 0.74194 |\n", - "| ULMFit Semi-supervised | 0.73119 | 0.75859 |\n", - "| ULMFit Semi-supervised Repeated One Time | **0.73372** | **0.75968** |\n", - "\n", - "For more information about the competition, see [1st Place Solution](https://www.kaggle.com/c/wisesight-sentiment/discussion/83564)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "bNjkuQK46z5M", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "outputId": "e04e1073-d8ca-4bd2-ac1c-2240826a75dc" - }, - "source": [ - "#uncomment if you are running from google colab\n", - "!pip install sklearn_crfsuite\n", - "!pip install emoji\n", - "!pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", - "!pip install fastai\n", - "!wget https://github.com/PyThaiNLP/wisesight-sentiment/archive/master.zip\n", - "!unzip master.zip\n", - "!mkdir wisesight_data; ls" - ], - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Collecting sklearn_crfsuite\n", - " Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl\n", - "Requirement already satisfied: tabulate in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (0.8.3)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (1.12.0)\n", - "Requirement already satisfied: tqdm>=2.0 in 
/usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (4.28.1)\n", - "Collecting python-crfsuite>=0.8.3 (from sklearn_crfsuite)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/2f/86/cfcd71edca9d25d3d331209a20f6314b6f3f134c29478f90559cee9ce091/python_crfsuite-0.9.6-cp36-cp36m-manylinux1_x86_64.whl (754kB)\n", - "\u001b[K |████████████████████████████████| 757kB 3.8MB/s \n", - "\u001b[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite\n", - "Successfully installed python-crfsuite-0.9.6 sklearn-crfsuite-0.3.6\n", - "Collecting emoji\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/1b/d7/2746b4dd67375ce253e777ba54869545d24d2b0249ebcf83735c99df68d5/emoji-0.5.3.tar.gz (43kB)\n", - "\u001b[K |████████████████████████████████| 51kB 2.0MB/s \n", - "\u001b[?25hBuilding wheels for collected packages: emoji\n", - " Building wheel for emoji (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for emoji: filename=emoji-0.5.3-cp36-none-any.whl size=42175 sha256=208a827318503334b8daef569a30c9a2bf390eb8da0c6e8326ada209ae4708cf\n", - " Stored in directory: /root/.cache/pip/wheels/86/09/26/f944015841423cd516e8a97f30e29be59e53461aea8b7d3458\n", - "Successfully built emoji\n", - "Installing collected packages: emoji\n", - "Successfully installed emoji-0.5.3\n", - "Collecting https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", - "\u001b[?25l Downloading https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", - "\u001b[K | 15.7MB 322kB/s\n", - "\u001b[?25hRequirement already satisfied: dill in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (0.3.0)\n", - "Collecting marisa-trie==0.7.4 (from pythainlp==2.1.dev2)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/1b/5f/21295ebb1feb1abde1e7652c0a4c182b4c25bdd5dda5a0f5b34d4e88bcc3/marisa_trie-0.7.4-cp36-cp36m-manylinux1_x86_64.whl (870kB)\n", - "\u001b[K |████████████████████████████████| 880kB 2.7MB/s \n", - 
"\u001b[?25hRequirement already satisfied: nltk>=3.2.2 in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (3.2.5)\n", - "Requirement already satisfied: pytz in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (2018.9)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (2.21.0)\n", - "Collecting tinydb (from pythainlp==2.1.dev2)\n", - " Downloading https://files.pythonhosted.org/packages/d7/f9/0e871cbf0da678cf1780609dc6aef26a5ed544c86733fc1ceaf134fce52c/tinydb-3.13.0-py2.py3-none-any.whl\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (4.28.1)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk>=3.2.2->pythainlp==2.1.dev2) (1.12.0)\n", - "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (1.24.3)\n", - "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (2.8)\n", - "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (3.0.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (2019.6.16)\n", - "Building wheels for collected packages: pythainlp\n", - " Building wheel for pythainlp (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for pythainlp: filename=pythainlp-2.1.dev2-cp36-none-any.whl size=11014043 sha256=edfa71c88f221b4c0428a99429b522cdd69c512db0dbe00ed8910da6114e2c44\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-foefq95n/wheels/79/4e/1e/26f3198c6712ecfbee92928ed1dde923a078da3d222401cc78\n", - "Successfully built pythainlp\n", - "Installing collected packages: marisa-trie, tinydb, pythainlp\n", - "Successfully installed marisa-trie-0.7.4 pythainlp-2.1.dev2 tinydb-3.13.0\n", - "Collecting fastai==1.0.45\n", - "\u001b[31m ERROR: Could not find a version that satisfies the requirement fastai==1.0.45 (from versions: 0.6, 0.7.0, 1.0.0b7, 1.0.0b8, 1.0.0, 1.0.1, 1.0.2, 1.0.3, 1.0.4, 1.0.5, 1.0.6, 1.0.7, 1.0.9, 1.0.10, 1.0.11, 1.0.12, 1.0.13, 1.0.14, 1.0.15, 1.0.16, 1.0.17, 1.0.18, 1.0.19, 1.0.20, 1.0.21, 1.0.22, 1.0.24, 1.0.25, 1.0.26, 1.0.27, 1.0.28, 1.0.29, 1.0.30, 1.0.31, 1.0.32, 1.0.33, 1.0.34, 1.0.35, 1.0.36, 1.0.36.post1, 1.0.37, 1.0.38, 1.0.39, 1.0.40, 1.0.41, 1.0.42, 1.0.43.post1, 1.0.44, 1.0.46, 1.0.47, 1.0.47.post1, 1.0.48, 1.0.49, 1.0.50, 1.0.50.post1, 1.0.51, 1.0.52, 1.0.53, 1.0.53.post1, 1.0.53.post2, 1.0.53.post3, 1.0.54, 1.0.55, 1.0.57)\u001b[0m\n", - "\u001b[31mERROR: No matching distribution found for fastai==1.0.45\u001b[0m\n", - "--2019-08-20 07:49:28-- https://github.com/PyThaiNLP/wisesight-sentiment/archive/master.zip\n", - "Resolving github.com (github.com)... 192.30.253.112\n", - "Connecting to github.com (github.com)|192.30.253.112|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://codeload.github.com/PyThaiNLP/wisesight-sentiment/zip/master [following]\n", - "--2019-08-20 07:49:29-- https://codeload.github.com/PyThaiNLP/wisesight-sentiment/zip/master\n", - "Resolving codeload.github.com (codeload.github.com)... 140.82.113.9\n", - "Connecting to codeload.github.com (codeload.github.com)|140.82.113.9|:443... 
connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: unspecified [application/zip]\n", - "Saving to: ‘master.zip’\n", - "\n", - "master.zip [ <=> ] 3.95M 2.65MB/s in 1.5s \n", - "\n", - "2019-08-20 07:49:31 (2.65 MB/s) - ‘master.zip’ saved [4137118]\n", - "\n", - "Archive: master.zip\n", - "a2b5c41957bc799fe61cd407e7dc191ca6122dcb\n", - " creating: wisesight-sentiment-master/\n", - " inflating: wisesight-sentiment-master/.gitignore \n", - " inflating: wisesight-sentiment-master/README.md \n", - " inflating: wisesight-sentiment-master/exploration.ipynb \n", - " creating: wisesight-sentiment-master/kaggle-competition/\n", - " inflating: wisesight-sentiment-master/kaggle-competition/README.md \n", - " inflating: wisesight-sentiment-master/kaggle-competition/competition.ipynb \n", - " inflating: wisesight-sentiment-master/kaggle-competition/test.txt \n", - " inflating: wisesight-sentiment-master/kaggle-competition/test_label.txt \n", - " inflating: wisesight-sentiment-master/kaggle-competition/test_majority.csv \n", - " inflating: wisesight-sentiment-master/kaggle-competition/test_solution.csv \n", - " inflating: wisesight-sentiment-master/kaggle-competition/text_generation.ipynb \n", - " inflating: wisesight-sentiment-master/kaggle-competition/train.txt \n", - " inflating: wisesight-sentiment-master/kaggle-competition/train_label.txt \n", - " inflating: wisesight-sentiment-master/kaggle-competition/train_model.py \n", - " inflating: wisesight-sentiment-master/neg.txt \n", - " inflating: wisesight-sentiment-master/neu.txt \n", - " inflating: wisesight-sentiment-master/pos.txt \n", - " inflating: wisesight-sentiment-master/q.txt \n", - " creating: wisesight-sentiment-master/word-tokenization/\n", - " inflating: wisesight-sentiment-master/word-tokenization/README.md \n", - " inflating: wisesight-sentiment-master/word-tokenization/data-preparation-and-post-processing.ipynb \n", - " inflating: 
wisesight-sentiment-master/word-tokenization/wisesight-160-samples-tokenised.label \n", - " inflating: wisesight-sentiment-master/word-tokenization/wisesight-160-samples-tokenised.txt \n", - "master.zip sample_data wisesight_data wisesight-sentiment-master\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "O-eB6ovn_UgH", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - }, - "outputId": "6a6b59e2-e0ed-4184-a0fe-ee55cd66f3b2" - }, - "source": [ - "!cd wisesight-sentiment-master/kaggle-competition; ls" - ], - "execution_count": 2, - "outputs": [ - { - "output_type": "stream", - "text": [ - "competition.ipynb test_majority.csv text_generation.ipynb train.txt\n", - "README.md\t test_solution.csv train_label.txt\n", - "test_label.txt\t test.txt\t train_model.py\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Zs8wtP0m6z5O", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import re\n", - "\n", - "import emoji\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from pythainlp import word_tokenize\n", - "from tqdm import tqdm_notebook\n", - "\n", - "#viz\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cRXpcJp16z5R", - "colab_type": "text" - }, - "source": [ - "## Text Processor for Logistic Regression" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "exvcf5XV6z5R", - "colab_type": "code", - "colab": {} - }, - "source": [ - "def replace_url(text):\n", - " URL_PATTERN = 
r\"\"\"(?i)\\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\\s()<>{}\\[\\]]+|\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)|\\([^\\s]+?\\))+(?:\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)|\\([^\\s]+?\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’])|(?:(?\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
categorytextsprocessedwcuwc
0neuเห็นคนลบแอพ viu ก็เห็นใจและเข้าใจเขานะคะ แผลมั...เห็น|คน|ลบ|แอพ|viu|ก็|เห็นใจ|และ|เข้าใจ|เขา|นะ...4641
1neuไปชมไม้คิวของแชมป์ และรองแชมป์ กันจ้า! ..........ไป|ชม|ไม้คิว|ของ|แชมป์|และ|รอง|แชมป์|กัน|จ้า|!...4139
2negกลุ่มรถซีวิคเป็นกลุ่มที่น่ารำคานมากกกกกกกกก อว...กลุ่ม|รถ|ซีวิค|เป็น|กลุ่ม|ที่|น่า|รำ|คาน|มาก|x...4635
3neuอยากสวยเหมือนเจ้าของแบรนด์สิคะ เนย โชติกา ใบหน...อยาก|สวย|เหมือน|เจ้าของ|แบรนด์|สิ|คะ|เนย|โชติ|...7256
4negข้าวโถละร้อย แพง เพราะตักเป็นจานๆละ15 เต็มที่ก...ข้าว|โถ|ละ|ร้อย|แพง|เพราะ|ตัก|เป็น|จาน|ๆ|ละ|15...379218
\n", - "" - ], - "text/plain": [ - " category texts ... wc uwc\n", - "0 neu เห็นคนลบแอพ viu ก็เห็นใจและเข้าใจเขานะคะ แผลมั... ... 46 41\n", - "1 neu ไปชมไม้คิวของแชมป์ และรองแชมป์ กันจ้า! .......... ... 41 39\n", - "2 neg กลุ่มรถซีวิคเป็นกลุ่มที่น่ารำคานมากกกกกกกกก อว... ... 46 35\n", - "3 neu อยากสวยเหมือนเจ้าของแบรนด์สิคะ เนย โชติกา ใบหน... ... 72 56\n", - "4 neg ข้าวโถละร้อย แพง เพราะตักเป็นจานๆละ15 เต็มที่ก... ... 379 218\n", - "\n", - "[5 rows x 5 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 13 - } + "data": { + "text/plain": [ + "(2674, 2)" ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with open(\"wisesight-sentiment-master/kaggle-competition/test.txt\") as f:\n", + " texts = [line.strip() for line in f.readlines()]\n", + "\n", + "test_df = pd.DataFrame({\"category\":\"test\", \"texts\":texts})\n", + "test_df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "E5rbH6aJ6z5a" + }, + "source": [ + "## Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "yhQ6MX7U6z5a" + }, + "outputs": [], + "source": [ + "all_df[\"processed\"] = all_df.texts.map(lambda x: \"|\".join(process_thai(x)))\n", + "all_df[\"wc\"] = all_df.processed.map(lambda x: len(x.split(\"|\")))\n", + "all_df[\"uwc\"] = all_df.processed.map(lambda x: len(set(x.split(\"|\"))))\n", + "\n", + "test_df[\"processed\"] = test_df.texts.map(lambda x: \"|\".join(process_thai(x)))\n", + "test_df[\"wc\"] = test_df.processed.map(lambda x: len(x.split(\"|\")))\n", + "test_df[\"uwc\"] = test_df.processed.map(lambda x: len(set(x.split(\"|\"))))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 102 }, + "colab_type": "code", + "id": "iEDdQ9hH6z5c", + "outputId": "d27939a6-ee4f-4892-bcc1-40623aca567b" + }, + 
"outputs": [ { - "cell_type": "code", - "metadata": { - "id": "GojEjj2k6z5m", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 102 - }, - "outputId": "1596bac3-8a47-49bf-ed6c-59745c422145" - }, - "source": [ - "#prevalence\n", - "print(train_df[\"category\"].value_counts() / train_df.shape[0])" - ], - "execution_count": 14, - "outputs": [ - { - "output_type": "stream", - "text": [ - "neu 0.544957\n", - "neg 0.253557\n", - "pos 0.180071\n", - "q 0.021415\n", - "Name: category, dtype: float64\n" - ], - "name": "stdout" - } + "data": { + "text/plain": [ + "neu 0.544612\n", + "neg 0.255164\n", + "pos 0.178698\n", + "q 0.021527\n", + "Name: category, dtype: float64" ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#prevalence\n", + "all_df.category.value_counts() / all_df.shape[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "-jPw7Rbu6z5f" + }, + "source": [ + "## Train-validation Split" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "K9b0gp626z5g" + }, + "source": [ + "We perform 85/15 random train-validation split. We also perform under/oversampling to balance out the classes a little." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ioXCNzVX6z5h" + }, + "outputs": [], + "source": [ + "#when finding hyperparameters\n", + "from sklearn.model_selection import train_test_split\n", + "train_df, valid_df = train_test_split(all_df, test_size=0.15, random_state=1412)\n", + "train_df = train_df.reset_index(drop=True)\n", + "valid_df = valid_df.reset_index(drop=True)\n", + "\n", + "#when actually doing it\n", + "# train_df = all_df.copy()\n", + "# valid_df = pd.read_csv('valid_df.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 }, + "colab_type": "code", + "id": "J5sCacbM6z5j", + "outputId": "03df103a-e045-4dfa-d544-2d51cb374346" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "sH1t3bal6z5o", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 102 - }, - "outputId": "0c50fe48-3671-4940-d4fa-294161e84fa0" - }, - "source": [ - "#prevalence\n", - "print(valid_df[\"category\"].value_counts() / valid_df.shape[0])" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categorytextsprocessedwcuwc
0neuเห็นคนลบแอพ viu ก็เห็นใจและเข้าใจเขานะคะ แผลมั...เห็น|คน|ลบ|แอ|พ|viu|ก็|เห็นใจ|และ|เข้าใจ|เขา|น...4843
1neuไปชมไม้คิวของแชมป์ และรองแชมป์ กันจ้า! ..........ไป|ชม|ไม้|คิว|ของ|แชมป์|และ|รอง|แชมป์|กัน|จ้า|...4341
2negกลุ่มรถซีวิคเป็นกลุ่มที่น่ารำคานมากกกกกกกกก อว...กลุ่ม|รถ|ซีวิค|เป็น|กลุ่ม|ที่|น่า|รำ|คาน|มาก|x...4736
3neuอยากสวยเหมือนเจ้าของแบรนด์สิคะ เนย โชติกา ใบหน...อยาก|สวย|เหมือน|เจ้าของ|แบรนด์|สิ|คะ|เนย|โชติ|...7256
4negข้าวโถละร้อย แพง เพราะตักเป็นจานๆละ15 เต็มที่ก...ข้าว|โถ|ละ|ร้อย|แพง|เพราะ|ตัก|เป็น|จาน|ๆ|ละ|15...381218
\n", + "
" ], - "execution_count": 15, - "outputs": [ - { - "output_type": "stream", - "text": [ - "neu 0.542659\n", - "neg 0.264266\n", - "pos 0.170914\n", - "q 0.022161\n", - "Name: category, dtype: float64\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fhsWvG9c6z5q", - "colab_type": "text" - }, - "source": [ - "## Logistic Regression" + "text/plain": [ + " category texts \\\n", + "0 neu เห็นคนลบแอพ viu ก็เห็นใจและเข้าใจเขานะคะ แผลมั... \n", + "1 neu ไปชมไม้คิวของแชมป์ และรองแชมป์ กันจ้า! .......... \n", + "2 neg กลุ่มรถซีวิคเป็นกลุ่มที่น่ารำคานมากกกกกกกกก อว... \n", + "3 neu อยากสวยเหมือนเจ้าของแบรนด์สิคะ เนย โชติกา ใบหน... \n", + "4 neg ข้าวโถละร้อย แพง เพราะตักเป็นจานๆละ15 เต็มที่ก... \n", + "\n", + " processed wc uwc \n", + "0 เห็น|คน|ลบ|แอ|พ|viu|ก็|เห็นใจ|และ|เข้าใจ|เขา|น... 48 43 \n", + "1 ไป|ชม|ไม้|คิว|ของ|แชมป์|และ|รอง|แชมป์|กัน|จ้า|... 43 41 \n", + "2 กลุ่ม|รถ|ซีวิค|เป็น|กลุ่ม|ที่|น่า|รำ|คาน|มาก|x... 47 36 \n", + "3 อยาก|สวย|เหมือน|เจ้าของ|แบรนด์|สิ|คะ|เนย|โชติ|... 72 56 \n", + "4 ข้าว|โถ|ละ|ร้อย|แพง|เพราะ|ตัก|เป็น|จาน|ๆ|ละ|15... 
381 218 " ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "valid_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 102 }, + "colab_type": "code", + "id": "GojEjj2k6z5m", + "outputId": "1596bac3-8a47-49bf-ed6c-59745c422145" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "oUAqMvNe6z5q", - "colab_type": "text" - }, - "source": [ - "### Create Features" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "neu 0.544957\n", + "neg 0.253557\n", + "pos 0.180071\n", + "q 0.021415\n", + "Name: category, dtype: float64\n" + ] + } + ], + "source": [ + "#prevalence\n", + "print(train_df[\"category\"].value_counts() / train_df.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 102 }, + "colab_type": "code", + "id": "sH1t3bal6z5o", + "outputId": "0c50fe48-3671-4940-d4fa-294161e84fa0" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "5eI-DEzW6z5r", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#dependent variables\n", - "y_train = train_df[\"category\"]\n", - "y_valid = valid_df[\"category\"]" - ], - "execution_count": 0, - "outputs": [] + "name": "stdout", + "output_type": "stream", + "text": [ + "neu 0.542659\n", + "neg 0.264266\n", + "pos 0.170914\n", + "q 0.022161\n", + "Name: category, dtype: float64\n" + ] + } + ], + "source": [ + "#prevalence\n", + "print(valid_df[\"category\"].value_counts() / valid_df.shape[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "fhsWvG9c6z5q" + }, + "source": [ + "## Logistic Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "oUAqMvNe6z5q" + }, + "source": [ + "### Create Features" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 15, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "5eI-DEzW6z5r" + }, + "outputs": [], + "source": [ + "#dependent variables\n", + "y_train = train_df[\"category\"]\n", + "y_valid = valid_df[\"category\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 }, + "colab_type": "code", + "id": "Ry4GTGaC6z5t", + "outputId": "30ae3a5e-b10b-4ec6-d907-5323714b1017" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "Ry4GTGaC6z5t", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "outputId": "30ae3a5e-b10b-4ec6-d907-5323714b1017" - }, - "source": [ - "#text faetures\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "tfidf = TfidfVectorizer(tokenizer=process_text, ngram_range=(1,2), min_df=20, sublinear_tf=True)\n", - "tfidf_fit = tfidf.fit(all_df[\"texts\"])\n", - "text_train = tfidf_fit.transform(train_df[\"texts\"])\n", - "text_valid = tfidf_fit.transform(valid_df[\"texts\"])\n", - "text_test = tfidf_fit.transform(test_df[\"texts\"])\n", - "text_train.shape, text_valid.shape" - ], - "execution_count": 17, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "((20453, 4563), (3610, 4563))" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 17 - } + "data": { + "text/plain": [ + "((20453, 4614), (3610, 4614))" ] - }, + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#text faetures\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "tfidf = TfidfVectorizer(tokenizer=process_thai, ngram_range=(1,2), min_df=20, sublinear_tf=True)\n", + "tfidf_fit = tfidf.fit(all_df[\"texts\"])\n", + "text_train = 
tfidf_fit.transform(train_df[\"texts\"])\n", + "text_valid = tfidf_fit.transform(valid_df[\"texts\"])\n", + "text_test = tfidf_fit.transform(test_df[\"texts\"])\n", + "text_train.shape, text_valid.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "7MnvOFdC6z5v", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "outputId": "a80b5dc6-7973-4be4-8ab6-1be4945f5f03" - }, - "source": [ - "#word count and unique word counts; actually might not be so useful\n", - "from sklearn.preprocessing import StandardScaler\n", - "\n", - "scaler = StandardScaler()\n", - "scaler_fit = scaler.fit(all_df[[\"wc\",\"uwc\"]].astype(float))\n", - "print(scaler_fit.mean_, scaler_fit.var_)\n", - "num_train = scaler_fit.transform(train_df[[\"wc\",\"uwc\"]].astype(float))\n", - "num_valid = scaler_fit.transform(valid_df[[\"wc\",\"uwc\"]].astype(float))\n", - "num_test = scaler_fit.transform(test_df[[\"wc\",\"uwc\"]].astype(float))\n", - "num_train.shape, num_valid.shape" - ], - "execution_count": 20, - "outputs": [ - { - "output_type": "stream", - "text": [ - "[21.55059635 17.94551802] [1081.91655857 490.1667113 ]\n" - ], - "name": "stdout" - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "((20453, 2), (3610, 2))" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 20 - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 448 ms, sys: 492 ms, total: 940 ms\n", + "Wall time: 938 ms\n", + "(4614, 5)\n" + ] }, { - "cell_type": "code", - "metadata": { - "id": "bj5PA95S6z5w", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "outputId": "925ecd72-0444-48d0-baa9-1e724d0242f5" - }, - "source": [ - "#concatenate text and word count features\n", - "X_train = np.concatenate([num_train,text_train.toarray()],axis=1)\n", - "X_valid = 
np.concatenate([num_valid,text_valid.toarray()],axis=1)\n", - "X_test = np.concatenate([num_test,text_test.toarray()],axis=1)\n", - "X_train.shape, X_valid.shape" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rankfeaturescorengramlabel
00ไม่0.0299901neg
11กิน0.0228521neg
22xxrep0.0202521neg
33เลย0.0194931neg
44แต่0.0181531neg
\n", + "
" ], - "execution_count": 21, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "((20453, 4565), (3610, 4565))" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 21 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "W4prkGZr6z5y", - "colab_type": "text" - }, - "source": [ - "### Fit Model" + "text/plain": [ + " rank feature score ngram label\n", + "0 0 ไม่ 0.029990 1 neg\n", + "1 1 กิน 0.022852 1 neg\n", + "2 2 xxrep 0.020252 1 neg\n", + "3 3 เลย 0.019493 1 neg\n", + "4 4 แต่ 0.018153 1 neg" ] - }, + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#visualize texts\n", + "from visualize import top_feats_all, plot_top_feats\n", + "features = tfidf_fit.get_feature_names()\n", + "%time ts = top_feats_all(text_train.toarray(), y_train, features)\n", + "print(ts[0].shape)\n", + "ts[0].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "_2IKPcUL6z5z", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "outputId": "b8e58817-b2ad-440c-9eee-d20e5468a557" - }, - "source": [ - "#fit logistic regression models\n", - "model = LogisticRegression(C=2., penalty=\"l2\", solver=\"liblinear\", dual=False, multi_class=\"ovr\")\n", - "model.fit(X_train,y_train)\n", - "model.score(X_valid,y_valid)" - ], - "execution_count": 22, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "0.7257617728531855" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 22 - } + "data": { + "image/png": "\n", + "text/plain": [ + "
" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "jFGtgFHF6z51", - "colab_type": "text" - }, - "source": [ - "### See Results" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.36 s, sys: 852 ms, total: 2.21 s\n", + "Wall time: 862 ms\n" + ] + } + ], + "source": [ + "%time plot_top_feats(ts)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 }, + "colab_type": "code", + "id": "7MnvOFdC6z5v", + "outputId": "a80b5dc6-7973-4be4-8ab6-1be4945f5f03" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "7cub1c5S6z51", - "colab_type": "code", - "colab": {} - }, - "source": [ - "probs = model.predict_proba(X_valid)\n", - "probs_df = pd.DataFrame(probs)\n", - "probs_df.columns = model.classes_\n", - "probs_df[\"preds\"] = model.predict(X_valid)\n", - "probs_df[\"category\"] = valid_df.category\n", - "probs_df[\"texts\"] = valid_df.texts\n", - "probs_df[\"processed\"] = valid_df.processed\n", - "probs_df[\"wc\"] = valid_df.wc\n", - "probs_df[\"uwc\"] = valid_df.uwc\n", - "probs_df[\"hit\"] = (probs_df.preds==probs_df.category)\n", - "probs_df.to_csv(\"probs_df_linear.csv\", index=False)" - ], - "execution_count": 0, - "outputs": [] + "name": "stdout", + "output_type": "stream", + "text": [ + "[21.96529942 18.22744462] [1151.47512883 513.46009207]\n" + ] }, { - "cell_type": "code", - "metadata": { - "id": "pa4Q0nPS6z54", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 300 - }, - "outputId": "3e392b32-54c6-4fda-e819-af43e9ba0fd7" - }, - "source": [ - "from sklearn.metrics import confusion_matrix\n", - "\n", - "conf_mat = confusion_matrix(probs_df.category,probs_df.preds)\n", - "print(model.score(X_valid,y_valid))\n", - "sns.heatmap(conf_mat, annot=True, fmt=\"d\",\n", - " xticklabels=model.classes_, 
yticklabels=model.classes_)\n", - "plt.ylabel(\"Actual\")\n", - "plt.xlabel(\"Predicted\")\n", - "plt.show()" - ], - "execution_count": 24, - "outputs": [ - { - "output_type": "stream", - "text": [ - "0.7257617728531855\n" - ], - "name": "stdout" - }, - { - "output_type": "display_data", - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXUAAAEKCAYAAADticXcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3XeYFFXWx/HvmYDMkCTnJWNGVDCh\ngqIIGFjjK2vCsKyIrrqiYlizggnXhIqiYsSwawbDuqgYABGQHEYxMIBIljjMzHn/6AJHhKGH6epu\nit/Hpx66b1fXPV0Ohzunbt02d0dERKIhI9UBiIhI4iipi4hEiJK6iEiEKKmLiESIkrqISIQoqYuI\nRIiSuohIhCipi4hEiJK6iEiEZKU6gK0Z3Pgs3eoaGF68INUhpI2xS2anOoS0UVRcnOoQ0kZhQb6V\n9xgbFn8Xd87JrtW83P2FRSN1EZEISduRuohIUhUXpTqChFBSFxEBKCpMdQQJoaQuIgK4R+MahZK6\niAhARC48K6mLiABopC4iEiG6UCoiEiEaqYuIRIdr9ouISIToQqmISISo/CIiEiG6UCoiEiEaqYuI\nRIgulIqIRIgulIqIRIe7auoiItGhmrqISISo/CIiEiEaqYuIREjRhlRHkBBK6iIioPKLiEikqPwS\nDRWq5nLk3RdSY7dG4M7/+j3BzxPyANi3dzc6/PNMnmpzEeuWrWKXarkceW9vqjWpQ+H6DYzq9wRL\nZ81L8SdIjDoNanPdA/2pUas67s7bL7zLa0P/Q8u9WnDlwMupsEsFigqLuP+6B5gxaRaHdTmUC646\nj2IvpqiwiIduGsyUr6am+mMkXKNG9Rk69F/UrVMLd2fo0Bd5+JGnGHDn9Rx33NEUFGzgu+9+4K+9\nr2TFipWpDjepju3SiUGDbiUzI4Onnn6Ju+95JNUhlU9ERurm7qmOYYsGNz4rKYEdNehvLBg3ixnD\nPyYjO5OsnF0oWLmGyvVr0OmeC6neogGvdr+BdctWccj1Pdmweh3j//U6u7aozxG39+KtngNCj3F4\n8YLQ+6hZpwY169Rk9tQ55FTK4cn3HuO682/k77dczCtP/Juxo8Zx8FEH0rPP/3HZaVeSk1uRtWvW\nAdB8j+bc8tg/ObvjeaHHOXbJ7ND7KKlevTrUq1eHSZOmUrlyJcZ8OYJTT7uQRo3qM2rU5xQVFXHH\n7dcCcP0N4f8slFSUwiSUkZHBjGmj6dq9J/PmLWDMlyM46+yLmTFjTkriKSzIt/IeY93o5+LOORUP\nP7vc/YUlI9UBpFKFKjk0OGg3Zgz/GIDiDUUUrFwDQIebzuLLO4ZT8h+9Gq0akv/FdACWf7uAKo1r\nkVOratLjDsOSRUuZPTX2F3Lt6rX8MOcHateLjU4rVckFoFKVSiz+eUlsnyChA+TkVoQ0HRyU18KF\ni5g0KfYbyKpVq5k5M4+GDevx3/9+SlFR7GaVseMm0rBR/VSGmXQHtt+Pb7/9nrlzf2TDhg288sqb\nnHjCsakOq1y8aEPcWzrbqZN6lca1Wbv0V44a1JvTRt5Op7svJCtnF5p22Z/VC5exZMaPv9t/8Ywf\nad6tHQB12janSsNaVK5fIxWhh6peo7q02rsl0yfO4KGbB
tPnht689tVLXPzPixgy4MlN+x3etQPP\nffI0dw27g4FX3pvCiJOjSZNG7Nt2L8aNm/i79l7nns77749KUVSp0aBhPX6aN3/T83n5C2jQoF4K\nI0oAL45/2wYze8rMFpnZ1BJtN5tZvplNCrbuJV671szyzGyWmR1bor1r0JZnZv3j+RihJnUz+9XM\nVm62/WRmr5tZ8zD7jkdGVia1927K1Gc/4tVuN1C4Zj3t/3EyB1xyIuPue+0P+0945G0qVK3E6e/d\nwT69urB42g8UF0WjDrdRTm5FbnviZh66aTBrVq2hxzkn8PDNj3Jq+548fMtgrrmv36Z9R7/3OWd3\nPI/rL7iRC67qlbqgk6BSpVyGv/Q4/frdzK+/rtrUfs01l1JYWMRLL72ewugkIYqL49+27Rmg6xba\n73f3tsE2AsDM9gTOAPYK3jPYzDLNLBN4BOgG7An0DPYtVdgj9X8BVwENgUZAP+BFYDjw1OY7m1lv\nMxtvZuM/WxV+bW7VgqWsWrCURZO+BeDbEeOovXdTqjSuzenv38lZX9xP5fo1OG3k7eTUrsaGVWsZ\ndeUQXul6PR9d/hgVa1Rh5Y+/hB5nsmRmZXLbEzfz4esf8enIzwDoeloXPhkxGoBRb3/CHm13/8P7\nvhk7hQZ/qk+16tEoRW0uKyuLl4cPYfjwN3jzzfc2tZ999ml079aZc3tdmsLoUmN+/kIaN2qw6Xmj\nhvWZP39hCiNKgASO1N39U2BpnD33AIa7+3p3nwvkAQcGW567f+fuBcTyZo9tHSzspH6iuz/u7r+6\n+0p3HwIc6+4vA9U339ndh7h7O3dvd1jlViGHBmt/WcGqBUvZtXmsHtqow178MvV7ntmvL88fegXP\nH3oFqxYs5dVuN7D2lxVUqJpLRnYmAHv07MSCsTPZsGpt6HEmyzX39eOHvB95Zchvv6Us+XkJbQ/Z\nF4D9D9uPeXPzAWjY9Le/0K33bkV2hQqsWBbN2R+PP34PM2fO4YEHn9jU1uWYTlz5j4s45dTzWbt2\nXSnvjqavxk+iZctmNG3amOzsbE4/vQdvv/NBqsMqn8SO1LfmEjObHJRnNubAhsBPJfaZF7Rtrb1U\nYU9pXGNmpwMbs8SpwMa/AWlxZW30P4dx9EN9yMzOYsWPixh15ZCt7lu9ZQM63/833GHZ7HmMuuqJ\nre67o9mn/d50PbUL307/jqEfPA7AEwOHcvdVg/j7rX3JzMqkYF0B91w9CICO3Y/g2FOPobCwkPXr\nCri5z22pDD80hx7anrPOPJUpU2YwbmxslH7jjXcxaNCtVNilAiPefRGAceMmcMml16Uy1KQqKiri\nsstvYMS7L5KZkcEzw15m+vTkzkxKuDLMUzez3kDvEk1DgkFraR4FbiOW+24D7gPOL2OU244tzCmN\nQd38AeAQYh9kDHAFkA8c4O6fbe29yZrSuCNIxpTGHUWypzSms1ROaUw3iZjSuPbdf8Wdc3KOu3yb\n/ZlZU+Add9+7tNfM7FoAdx8QvPY+cHOw683ufmzQ/rv9tibUkbq7fwecsJWXt5rQRUSSLuQ7Ss2s\nvrtvHKGdBGycGfMW8KKZDQIaAK2AcYABrcysGbGB8BnAX7bVT6hJ3cxaE/uVo27wL1IbYnX228Ps\nV0SkzBL4m4+ZvQR0AmqZ2TzgJqCTmbUlVrX4HvgbgLtPM7NXgOlAIdDXg2/sMLNLgPeBTOApd5+2\nrb7Drqk/QWz2y+MA7j7ZzF4ElNRFJL0kcKTu7j230Dy0lP3vAO7YQvsIYERZ+g47qee6+ziz35Wf\novHtriISLRG5RhF2Ul9sZi0IZrqY2amArvqJSPrRKo1x6QsMAXY3s3xgLnBmyH2KiJRdYTSKCGEn\n9XzgaWAUUANYCZwL3BpyvyIiZRORRenCTupvAsuBCcD8bewrIpI6qqnHpZG7b2lRGxGR9BKRpB72\n2i9fmNk+IfchIlJ+C
VzQK5XCHqkfBvQys7nAemJ3SLm7twm5XxGRsgm+9GRHF3ZS7xby8UVEEiMi\n5Zew1375Iczji4gkjJK6iEiEpHmtPF5K6iIigBdrnrqISHSo/CIiEiGa/SIiEiEaqYuIRIiSuohI\nhGhBLxGRCNFIXUQkQjSlMVx3rZmc6hDSRt6sN1IdQtqo31yLfm60fN3qVIcQLZr9IiISHa7yi4hI\nhKj8IiISIVr7RUQkQjRSFxGJkEJdKBURiQ6VX0REIkTlFxGR6NCURhGRKInISD0j1QGIiKSFYo9/\n2wYze8rMFpnZ1BJt95jZTDObbGavm9muJV671szyzGyWmR1bor1r0JZnZv3j+RhK6iIiEFsmIN5t\n254BNl/T4kNgb3dvA8wGrgUwsz2BM4C9gvcMNrNMM8sEHgG6AXsCPYN9S6WkLiJC7DtK4922eSz3\nT4Glm7V94O6FwdMxQKPgcQ9guLuvd/e5QB5wYLDluft37l4ADA/2LZWSuogIlKn8Yma9zWx8ia13\nGXs7HxgZPG4I/FTitXlB29baS6ULpSIiUKb11N19CDBke7oxs+uBQuCF7Xn/tiipi4hAUma/mFkv\n4Higs/umr1rKBxqX2K1R0EYp7Vul8ouICCR09suWmFlX4GrgRHdfU+Klt4AzzGwXM2sGtALGAV8B\nrcysmZlVIHYx9a1t9aORuogI4EWJu/nIzF4COgG1zGwecBOx2S67AB+aGcAYd7/I3aeZ2SvAdGJl\nmb7uXhQc5xLgfSATeMrdp22rbyV1ERFIaPnF3XtuoXloKfvfAdyxhfYRwIiy9K2kLiICcU1V3BEo\nqYuIQGSWCVBSFxEBiMZ6XkrqIiIAXhiNrL5TJ/V7HryFo7p0ZMnipXQ57GQArry2L8d0O5Li4mKW\nLF7KlZf8k0ULf6FqtSrc89CtNGnamPXr13PVpTcxe2Zeij9B+dxw5yA+/XwcNarvyhvPPwbAlf8c\nwPc/zgPg11WrqFK5Mv8e9ghTps/i5rseBMBxLj7/TI7u2IH16ws4t+9VFGzYQFFhEccceRiXXHh2\nyj5TGC7q24uzzjkNd2fG9Nlc2qc/d913E2332wcz+Dbvey7t05/Vq9ds+2ARk5GRwdgxI5mfv5Ae\nJ52b6nDKJxo5Hftt/nt6aVKzTeiBHXjIAaxZvYZBg+/YlNQrV6nEql9XA9Cr919o1bo51/e7netu\n/gerV6/hgXseo0Wrptx29/X85aS/hh0iAHmz3gjluOMnTSE3J4frbrt3U1Iv6Z6HnqBypVz6nH8m\na9etIzsrm6ysTH5ZvJRTzr2Y/735ApmZGaxdu47c3Bw2FBZyTp9+9L/sb+y79x6hxFy/+eZrJIWr\nXv26vPv+i3Q4sDvr1q3nyWf+xX8/+IR33v5g08/JbXdeyy+/LOHB+7frBsPttnzd6qT2tyWXX9ab\nAw5oQ9UqVVKa1AsL8q28x1h2Wqe4c071Vz8ud39h2alvPhr35dcsX7bid20b/6IC5ObmsPH/cqvd\nmvPF6HEAfDvnexo1bkCt2jWSFWoo2rXdh2pVq2zxNXfnvf99SvdjOgGQU7EiWVmZAKwvKIDYPFvM\njNzcHAAKCwspLCwkmIMbGVlZWVTMqUhmZia5uTksXLjodz8nFSvuQroOjsLUsGF9unfrzFNPvZTq\nUBKjuAxbGtupk/rWXHX9pXw5+QP+fOpxDBrwCADTp82m6/GdAdh3/71p2Lg+9RrUTWWYofr6m6nU\nrF6dJo1/Wz9o8rSZ9Djzb5x0Th9uvOqSTUm+qKiIU87tyxHH9+SQ9vvRZq/dUxV2wi1c8DOPPDSU\nSdM+Ztqcz1m58lc+/t/nADw4eADT876gVevmPPn4cymONPkG3XcL/a+9neKIfGNQIldpTKVQk7qZ\n/WpmK4NtnZkVmdnKMPtMhHvueIhD2nThjdfe5dwLY/cQPPrAUKpWq8KIj1+h1197Mm3
KTIoTeAda\nuhnx4cd0P6bj79ra7LU7b77wOMOffIAnn3uF9esLAMjMzOTfwx7ho9efY8r02cz57vsURByOartW\npVv3zhywz1Hs3fowcnNzOe3/TgTg7xdfy96tD2P27G/588ndUxxpch3X/WgWLVrMhIlTUh1K4mik\nvm3uXsXdq7p7VSAHOAUYvLX9Sy5nuWrd0q3tljRvvPou3U44GoiVZa669Ea6dzqdK/pcT42a1fnx\nh3kpjjAchYVF/PeTL+ja+Ygtvt6i6Z/Izcn5Q/KuWqUyB+7fhs/GjE9ClMnRsdOh/PDDPJYsWUZh\nYSHvvP0B7Q/ab9PrxcXFvP7au5zQ49hSjhI9hx7ajhOO70Le7DG88PxgjjyyA8OeeTDVYZWLF8a/\npbOklV885g1gqz/97j7E3du5e7vKFVNTr27a/E+bHnfpfiTfzpkLQNWqVcjOjk0WOuPsUxj35YTf\n1VWjZMz4iTRv0oh6dWpvaps3fyGFhbFvfJm/8Gfm/vATDevXZemy5az8dRUA69av58uvJtKsSeMt\nHndHNG/efNq1b0tOTkUAjuh4CLNnfUezEj8nXbt3Zs7s71IVYkpcf8NAmjZvR8vWB3PmWRczatTn\nnNvr76kOq1y8OP4tnYU6pdHMTi7xNANoB6wLs8+yeHDIXRzSoR3Va+7KmCkfcv/AwRx5zOE0b9mU\n4uJi8n9awHX9bgOgZetm3PfI7TgwZ2YeV/39ptQGnwBX3TSQryZOZvnylXT+81lcfMHZnHLCsYz8\n7yd0O7rT7/adMHkaQ597haysLDIyjBv69aX6rtWYlTeX62+/l6LiYrzYOfaow+nU4aDUfKAQTBg/\nmbfffJ//jX6DwsJCpkyewbNPD+f1d56lSpXKmBnTps6k3xU7/s/DTi/Nk3W8Qp3SaGZPl3haCHwP\nPOHui7b13mRMadxRhDWlcUeU7CmN6SwdpjSmi0RMafzlmI5x55zaH36StlO8Qh2pu/t5YR5fRCRR\n0r2sEq+wZ7+0NrOPzGxq8LyNmd0QZp8iItvDiyzuLZ2FfaH0CWILw28AcPfJxL69Q0QkrehCaXxy\n3X3cZncYpvmEIBHZGXlxeo/A4xV2Ul9sZi0gdre9mZ0KLAi5TxGRMkv3EXi8wk7qfYEhwO5mlg/M\nBc4MuU8RkTJz10g9HvnA08AooAawEjgXuDXkfkVEykQj9fi8CSwHJgDzQ+5LRGS7Faf5rJZ4hZ3U\nG7m77hYRkbQXlQulYU9p/MLM9gm5DxGRcvNii3tLZ1sdqZvZ28BWb5t19xPjOP5hQC8zmwusByz2\nVm9T1kBFRMIUle85Ka38cm8Cjt8tAccQEQlduo/A47XVpO7un5T34O7+Q3mPISKSDDvNlEYzawUM\nAPYEKm5sd/fmIcYlIpJURRGZ/RLPhdKngUeJ3d5/JPAs8HyYQYmIJJu7xb2ls3iSeo67f0Rs7fUf\n3P1m4LhwwxIRSa5Ezn4xs8vMbKqZTTOzy4O2Gmb2oZnNCf6sHrSbmT1oZnlmNtnM9i/P54gnqa83\nswxgjpldYmYnAZXL06mISLpxj38rjZntDfwVOBDYFzjezFoC/YGP3L0V8FHwHGITSloFW29ilZHt\nFk9SvwzIBf4OHACcTexWfxGRyEjgSH0PYKy7r3H3QuAT4GSgBzAs2GcY8OfgcQ/g2eB7nMcAu5pZ\n/e39HNu8UOruXwUPVwH6JiMRiaSi4oTdizkVuMPMagJrge7AeKCuu29cpXYhUDd43BD4qcT75wVt\n27WibTyzX0axhZuQ3P2o7elQRCQdleXmIzPrTaxUstEQdx8SO47PMLO7gA+A1cAkoOj3fbmbWSi3\nO8Wz9ku/Eo8rAqegL7oQkYgpLsOsliCBDynl9aHAUAAzu5PY6PtnM6vv7guC8sqiYPd8oHGJtzcK\n2rZLPOWXrzdr+tzMxm1vhyIi6SiRUxXNrI67LzK
zPxGrpx8MNCN2PXJg8Oebwe5vAZeY2XDgIGBF\niTJNmcVTfqlR4mkGsYul1ba3QxGRdJTgtV/+HdTUNwB93X25mQ0EXjGzC4AfgNODfUcQq7vnAWso\n57XLeMovXxOrqRuxsstc4ILydBqP5etXh93FDuP4/fqmOoS00bRS3W3vtJOYtO67VIcQKWUpv2yL\nux++hbYlQOcttDuxb4lLiHiS+h7uvq5kg5ntkqgARETSQQJnv6RUPJ/iiy20fZnoQEREUsnLsKWz\n0tZTr0dsrmSOme1HrPwCUJXYzUgiIpGRyPJLKpVWfjkW6EVses19/JbUVwLXhRuWiEhypftCXfEq\nbT31YcAwMzvF3f+dxJhERJKuONUBJEg8NfUDzGzXjU/MrLqZ3R5iTCIiSedY3Fs6iyepd3P35Ruf\nuPsyYnMqRUQio9At7i2dxTOlMdPMdnH39QBmlgNoSqOIREq6j8DjFU9SfwH4yMyeJnaxtBe/LR8p\nIhIJUampx7P2y11m9g1wNLEpmu8DTcIOTEQkmXamkTrAz8QS+mnElgnQbBgRiZTIj9TNrDXQM9gW\nAy8T+57SI5MUm4hI0hTtBCP1mcBo4Hh3zwMwsyuSEpWISJLF8X3SO4TSpjSeTOzrlEaZ2RNm1hki\n8k+ZiMhmirG4t3S21aTu7m+4+xnA7sAo4HKgjpk9amZdkhWgiEgyRGVBr23efOTuq939RXc/gdg6\nMBOBa0KPTEQkiYrLsKWzeGe/AJvuJi31u/lERHZExZbeZZV4lSmpi4hEVVGqA0gQJXUREaIz+0VJ\nXUQE0n5WS7yU1EVESP9ZLfFSUhcRQeWXSKpWrQoPPTKAPfZsjbvTt09/vho3EYBLLr2AOwZcR7Mm\n7Vi6ZFmKI0287F2yue+1e8iukE1mZiajR3zGc4Oe55oHr6ZVm1YUFRYya9JsHuj/IEWFsUtKfW65\niAOPas+6teu57x/3kTf12xR/isSo26AOtzx4PTVq18Ddef35txj+5GtcdPUFdDz2cIqLi1m2ZBk3\nX3Yni39eQsdjD+Oiqy+kuLiYoqIi7rvxQb4ZNyXVHyN0x3bpxKBBt5KZkcFTT7/E3fc8kuqQyiXd\npyrGy9zT85eOapVbJD2wRx+/hy+/+Ipnh71CdnY2ubkVWbHiVxo2rM9Dj9xJq9Yt6Hh4j6Qn9YOr\nt05KPxVzK7JuzToyszIZ9J97efSmx6myaxW+GvUVAP0fvoapY6fyznPv0v7I9vQ47wRuOOdGdt9v\nd/rc8jcuOzH8VSSWFK4KvY+adWpSq25NZk2ZTW6lHJ57fyj9zr+ORfMXsXrVGgD+74JTaN66KQOu\nuY+c3BzWrlkLQMs9WjBwyC2cevhZocc5acl3ofexNRkZGcyYNpqu3Xsyb94Cxnw5grPOvpgZM+ak\nJJ7Cgvxyj7OHNjor7pxzwbzn03ZcH883H+0UqlatTIcO7Xl22CsAbNiwgRUrfgVgwF3Xc+MNd5Gu\n/wAmyro16wDIysoiMysLd9+U0AFmTZpFrfq1ADiky8H8998fATBz4kwqVa1MjTrVkx90CJYsWsKs\nKbMBWLN6Ld/P+Z469WptSugAObk5bPxx2JjQY+0VI/9zAnBg+/349tvvmTv3RzZs2MArr7zJiScc\nm+qwyiUqNx+FmtTN7G4zq2pm2Wb2kZn9YmbhD2G2Q5MmjVm8eCmDH7ub0Z+/xUMP30lubg7djzua\n+fN/ZurUmakOMXQZGRkMfu9hXp70EhNHT2TWpFmbXsvMyqTzyZ0Z//F4AGrVq8kv8xdven3xgsXU\nrFcr6TGHrX6jeuy2T2umTpgOwMX9/8o741+j28nH8Ng9Qzft16nb4bw2+nn+9dzd3HrFwFSFmzQN\nGtbjp3nzNz2fl7+ABg3qpTCi8lNSj08Xd18JHA98D7QErtrazmbW28zGm9n4gg0rQw7t97Kysti3\n7V4MffIFDu9
wIqvXrOXa6y7jyn59uPP2+5MaS6oUFxdzcddLOPPAs9mtbWua7Pbbd6Fcekdfpo6d\nytRx01IYYXLl5OZw99Dbue/GBzeN0gcPfILj253KyP98yOnnnbxp349HjubUw8+i3/nXcdHVF6Yq\nZCkHt/i3dBZ2Ut94IfY44FV3X1Hazu4+xN3buXu7CtlVQw7t9/LzF5Cfv5Cvx38DwJtvjGTftnvR\npGljPvvyXSZP+4SGDevx6WdvUadO9EakJa1euZpvvphM+07tADjz8r9QrWY1Hr/1t9UhFi9cQu0G\nv52HWvVrsWTh4j8ca0eVmZXJ3UNv573/fMioEZ/+4fWR//mAzsd1/EP7xDHf0LBJA6rVqJaMMFNm\nfv5CGjdqsOl5o4b1mT9/YQojKj+N1OPzjpnNBA4g9j2ntYF1Ife5XRYtWkx+/gJatmoGQMdOh/LN\npGm0bHYgbfbqSJu9OpKfv5AjDjuRRYuik7w2qlajGpWqVgKgQsUK7H/EfvyU9xNdzziWdh0PYMAl\nv7+mMObDMRx9SmcAdt9vd9b8upqli6IzK+jGQf2ZO+d7Xnj85U1tjZs12vS407GH833ejwA0atpw\nU/tu+7SmQoVsViwtdfyyw/tq/CRatmxG06aNyc7O5vTTe/D2Ox+kOqxyKSrDls5CndLo7v3N7G5g\nhbsXmdlqoEeYfZbH1VfewpND7ye7Qjbfz/2Jvn2uTnVISVOjTnX63d+PjMwMMjKMT98ezdiPxjFi\n7jv8nL+If70xCIDPR37BCw+8yLj/fUX7o9rz9GdPsX7tOu67Mjolqn0P3IfjTuvKnOnf8sKHTwEw\neMAQevzlOJq0+BPFxc6CeQsZcM29AHQ+riPdT+tK4YZC1q9bz7UX3ZTK8JOiqKiIyy6/gRHvvkhm\nRgbPDHuZ6dNnpzqscknkPHUz2xV4Etib2H1N5wOziH2DXFNi5ejT3X2ZmRnwANAdWAP0cvcJ2913\nmFfqzSwb6AMcETR9Ajzm7hu29d5UTGlMV8ma0rgjSMaUxh1FKqc0pptETGm8/0/xT2m84sfSpzSa\n2TBgtLs/aWYVgFzgOmCpuw80s/5AdXe/xsy6A5cSS+oHAQ+4+0Hb+znCLr88Sqz0MjjY9g/aRETS\nSqJq6mZWjdhAdiiAuxe4+3JiVYphwW7DgD8Hj3sAz3rMGGBXM6u/vZ8j7DtK27v7viWe/8/Mvgm5\nTxGRMktgaaAZ8AvwtJntC3wNXAbUdfcFwT4LgbrB44bATyXePy9oW8B2CHukXmRmLTY+MbPmpP91\nBhHZCRVb/FvJ6dfB1rvEobIIqhLuvh+wGuhfsi+P1b1DKTGHPVK/itgXV28s/jUFzgu5TxGRMivL\naNPdS/sGuHnAPHcfGzx/jVhS/9nM6rv7gqC8sih4PR9oXOL9jYK27RL2SP1z4HFiZailweMvQ+5T\nRKTMivG4t9K4+0LgJzPbLWjqDEwH3gLODdrOBd4MHr8FnGMxBxObLbhdpRcIf6T+LLASuC14/hfg\nOeC0kPsVESmTBN9UdCnwQjDz5TtiFYoM4BUzuwD4ATg92HcEsZkvecSmNJarmhF2Ut/b3fcs8XyU\nmU0PuU8RkTJLZIHb3ScB7bbwUuct7OtA30T1HXb5ZULw6wQAZnYQMD7kPkVEyiwqywSEPVI/APjC\nzH4Mnv8JmGVmU4j9A9Um5P4Jw+0sAAAL5UlEQVRFROJSaNG43zHspN415OOLiCRENFJ6+Gu//BDm\n8UVEEiXdyyrx0neUiojANqcq7iiU1EVEUPlFRCRSVH4REYmQooiM1ZXURUTQSF1EJFJcI3URkejQ\nSF1EJEI0pVFEJEKikdKV1EVEACiMSFpXUhcRQRdKQ7e6YF2qQ0gbY5bNTnUIaUM/FxIWXSgVEYkQ\njdRFRCJEI3URkQgpco3URUQiQ/PURUQiRDV1EZEIUU1dRCRCVH4REYkQlV9ER
CJEs19ERCJE5RcR\nkQjRhVIRkQiJSk09I9UBiIikg2I87q00ZlbRzMaZ2TdmNs3Mbgnam5nZWDPLM7OXzaxC0L5L8Dwv\neL1peT6HkrqICODucW/bsB44yt33BdoCXc3sYOAu4H53bwksAy4I9r8AWBa03x/st92U1EVEgCI8\n7q00HrMqeJodbA4cBbwWtA8D/hw87hE8J3i9s5nZ9n4OJXURERJXfgEws0wzmwQsAj4EvgWWu3th\nsMs8oGHwuCHwE0Dw+gqg5vZ+DiV1ERHKVn4xs95mNr7E1nuzYxW5e1ugEXAgsHuyPodmv4iIULZ5\n6u4+BBgSx37LzWwUcAiwq5llBaPxRkB+sFs+0BiYZ2ZZQDVgSRnD30QjdRERYlMa4/2vNGZW28x2\nDR7nAMcAM4BRwKnBbucCbwaP3wqeE7z+P4/jauzWaKQuIkJClwmoDwwzs0xiA+dX3P0dM5sODDez\n24GJwNBg/6HAc2aWBywFzihP50rqIiIkbpkAd58M7LeF9u+I1dc3b18HnJaQzlFSFxEBtPbLTiEj\nI4OxY0YyP38hPU46d9tviJBq1arw0CMD2GPP1rg7ffv056txE+l90Tn8tfdZFBUV8cF7H3PjP8t1\nn8QOJ2/2GH5dtYqiomIKCws5+JDuqQ4pZaJ2LspRxk4rSuql+PulFzJz5hyqVqmS6lCSbuDdN/Lf\nDz/lnLMuITs7m9zcihx+xMEcd9zRdDj4eAoKCqhVe7un0u7Qjj7mNJYsWZbqMNJClM6FRupxMLN/\nlPa6uw8Ks//yaNiwPt27dWbAwAe5/LLe235DhFStWpkOHdrT529XAbBhwwZWrNjABRf+hfvve4yC\nggIAFv+y3bOuRNKOFvSKTzugD7E7phoCFwH7A1WCLW0Nuu8W+l97O8XFUVmQM35NmjRm8eKlDH7s\nbkZ//hYPPXwnubk5tGjZjEM6tOejUf/m3fdeZP/990l1qEnn7owc8RJjx4zkwgvOTHU4KRW1c1Hk\nxXFv6Szs8ksjYH93/xXAzG4G3nX3s0Lut1yO6340ixYtZsLEKXQ84pBUh5N0WVlZ7Nt2L67qdwtf\nj/+GgXf/kyuuvIisrCyqV9+Vzkeewv4HtOGZZx+izd6dUh1uUnU88iTmz19I7do1eW/kcGbNymP0\nZ2NTHVZKRO1cRKWmHvZIvS5QUOJ5QdC2RSVvvS0uXh1yaFt36KHtOOH4LuTNHsMLzw/myCM7MOyZ\nB1MWT7Ll5y8gP38hX4//BoA33xjJvvvuxfz8hbz91vsATPh6MsXFxdSsVSOVoSbd/PkLAfjllyW8\n+eZI2rdvm+KIUidq5yKRa7+kUthJ/VlgnJndHIzSxwLPbG1ndx/i7u3cvV1GRqWQQ9u6628YSNPm\n7WjZ+mDOPOtiRo36nHN7/T1l8STbokWLyc9fQMtWzQDo2OlQZs3M4913PuDwIw4GoEXLpmRXqMCS\nxUtTGWpS5ebmULlypU2Pjzm6I9OmzUpxVKkRxXORqDtKUy3U8ou732FmI4HDg6bz3H1imH1KYlx9\n5S08OfR+sitk8/3cn+jb52pWr17LI48O5MtxI9lQULDpQurOom7d2rz2auwmwKysTIYPf4P3P/g4\ntUGlSBTPRXFEyi+WrnWkrAoN0zOwFKhUoWKqQ0gbqwvWpToESUOFBfnbvf74RnvVPSjunDPt57Hl\n7i8smqcuIgJpP6slXkrqIiJEp/yipC4iQnRuPlJSFxFBI3URkUjRSF1EJEKKvCjVISSEkrqICNFZ\nJkBJXUQELb0rIhIpGqmLiESIZr+IiESIZr+IiESIlgkQEYkQ1dRFRCJENXURkQjRSF1EJEI0T11E\nJEI0UhcRiRDNfhERiZCoXCjNSHUAIiLpwN3j3rbFzLqa2SwzyzOz/kkIfxMldRERYneUxvtfacws\nE3gE6AbsCfQ0sz2T8BEAJXURESChI/UDg
Tx3/87dC4DhQI/QP0BANXURERJaU28I/FTi+TzgoEQd\nfFvSNqkXFuRbqmMAMLPe7j4k1XGkA52L3+hc/CYq56IsOcfMegO9SzQNSZdzoPLLtvXe9i47DZ2L\n3+hc/GanOxfuPsTd25XYSib0fKBxieeNgrakUFIXEUmsr4BWZtbMzCoAZwBvJavztC2/iIjsiNy9\n0MwuAd4HMoGn3H1asvpXUt+2tKiTpQmdi9/oXPxG52Iz7j4CGJGKvi0q6x2IiIhq6iIikaKkLiIS\nIUrqIiIRslMndTNramYzzOwJM5tmZh+YWY6ZtTCz98zsazMbbWa7B/u3MLMxZjbFzG43s1Wp/gyJ\nsh3n4hkzO7XE+yNzLmDT+ZhpZi8E5+U1M8s1s85mNjH4GXjKzHYJ9h9oZtPNbLKZ3Zvq+JPBzK43\ns9lm9pmZvWRm/VIdk+zkST3QCnjE3fcClgOnELuaf6m7HwD0AwYH+z4APODu+xC79TdqynIudga7\nAYPdfQ9gJfAP4Bng/4KfgSygj5nVBE4C9nL3NsDtKYo3aczsAGLzr9sC3YH2qY1INlJSh7nuPil4\n/DXQFDgUeNXMJgGPA/WD1w8BXg0ev5jMIJOkLOdiZ/CTu38ePH4e6EzsHM0O2oYBRwArgHXAUDM7\nGViT9EiT73DgdXdf4+4rSeLNNVI6zVOH9SUeFwF1geXu3jZF8aRSWc5FIcGgwMwygArhh5d0m8/3\nXQ7U/MNOsZtNDiSW9E8FLgGOCj88kT/SSP2PVgJzzew0AIvZN3htDLGSBMR+9Yy60s7F98ABweMT\ngezkhxe6P5nZIcHjvwDjgaZm1jJoOxv4xMwqA9WCG06uAPb946Ei51Pgz8F1lyrACakOSGKU1Lfs\nTOACM/sGmMZvayFfDvzDzCYDLYn92h11WzsXTwAdg/ZDgNUpii9Ms4C+ZjYDqA7cD5xHrBw1BSgG\nHgOqAO8EPxefEau9R5q7TwBeBr4BRhJb70TSgO4oLQMzywXWurub2RlAT3dP2uL3kjxm1hR4x933\nTnEoOwQzuxlY5e47xcyfdKaaetkcADxsZkasvnp+iuMREfkdjdRFRCJENXURkQhRUhcRiRAldRGR\nCFFSl4QzsyIzm2RmU83s1WDW0PYeq5OZvRM8PtHM+pey765mdvF29HGz1i2RqFBSlzCsdfe2wXTA\nAuCiki8GNzGV+WfP3d9y94Gl7LIrUOakLhIlSuoSttFAy2DVw1lm9iwwFWhsZl3M7EszmxCM6CsD\nmFnXYIXECcDJGw9kZr3M7OHgcV0ze93Mvgm2Q4GBQIvgt4R7gv2uMrOvgtUTbylxrE0rDBJbuEsk\nEjRPXUJjZllAN+C9oKkVcK67jzGzWsANwNHuvtrMriF2t+7dxO5WPQrII3bX4pY8CHzi7ieZWSZQ\nGegP7L1xrRoz6xL0eSBgwFtmdgSxu183rjCYBUwgtoCZyA5PSV3CkBOs6gixkfpQoAHwg7uPCdoP\nBvYEPo/dy0UF4Etgd2IrIc4BMLPngd5b6OMo4BwAdy8CVphZ9c326RJsE4PnlYkl+SoEKwwGfWiF\nQYkMJXUJw9rNV3YMEnfJ9WEM+NDde262XyJXxzRggLs/vlkflyewD5G0opq6pMoYoMPGFQ/NrJKZ\ntQZmElsJsUWwX8+tvP8joE/w3kwzqwb8SmwUvtH7wPklavUNzawOWmFQIkxJXVLC3X8BegEvBasb\nfgns7u7riJVb3g0ulC7ayiEuA44MVkv8GtjT3ZcQK+dMNbN73P0DYl9m8mWw32tAFa0wKFGmtV9E\nRCJEI3URkQhRUhcRiRAldRGRCFFSFxGJECV1EZEIUVIXEYkQJXURkQhRUhcRiZD/B16fsAOjNHqO\nAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } + "data": { + "text/plain": [ + "((20453, 2), (3610, 2))" ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#word count and unique word counts; actually might not be so useful\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "scaler = StandardScaler()\n", + "scaler_fit = scaler.fit(all_df[[\"wc\",\"uwc\"]].astype(float))\n", + "print(scaler_fit.mean_, scaler_fit.var_)\n", + "num_train = scaler_fit.transform(train_df[[\"wc\",\"uwc\"]].astype(float))\n", + "num_valid = scaler_fit.transform(valid_df[[\"wc\",\"uwc\"]].astype(float))\n", + "num_test = scaler_fit.transform(test_df[[\"wc\",\"uwc\"]].astype(float))\n", + "num_train.shape, num_valid.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 }, + "colab_type": "code", + "id": "bj5PA95S6z5w", + "outputId": "925ecd72-0444-48d0-baa9-1e724d0242f5" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "KIfXHEVn6z58", - "colab_type": "text" - }, - "source": [ - "## [ULMFit](https://github.com/cstorm125/thai2fit) Model" + "data": { + "text/plain": [ + "((20453, 4616), (3610, 4616))" ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#concatenate text and word count features\n", + "X_train = np.concatenate([num_train,text_train.toarray()],axis=1)\n", + "X_valid = np.concatenate([num_valid,text_valid.toarray()],axis=1)\n", + "X_test = np.concatenate([num_test,text_test.toarray()],axis=1)\n", + "X_train.shape, X_valid.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "W4prkGZr6z5y" + }, + "source": [ + "### Fit Model" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 }, + 
"colab_type": "code", + "id": "_2IKPcUL6z5z", + "outputId": "b8e58817-b2ad-440c-9eee-d20e5468a557" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "xPQoEbfC6z58", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from fastai.text import *\n", - "from fastai.callbacks import CSVLogger, SaveModelCallback\n", - "from pythainlp.ulmfit import *\n", - "\n", - "model_path = \"wisesight_data/\"\n", - "all_df = pd.read_csv(\"all_df.csv\")\n", - "train_df, valid_df = train_test_split(all_df, test_size=0.15, random_state=1412)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RIUQEyb96z5-", - "colab_type": "text" - }, - "source": [ - "### Finetune Language Model" + "data": { + "text/plain": [ + "0.7324099722991689" ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#fit logistic regression models\n", + "model = LogisticRegression(C=2., penalty=\"l2\", solver=\"liblinear\", dual=False, multi_class=\"ovr\")\n", + "model.fit(X_train,y_train)\n", + "model.score(X_valid,y_valid)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "jFGtgFHF6z51" + }, + "source": [ + "### See Results" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "7cub1c5S6z51" + }, + "outputs": [], + "source": [ + "probs = model.predict_proba(X_valid)\n", + "probs_df = pd.DataFrame(probs)\n", + "probs_df.columns = model.classes_\n", + "probs_df[\"preds\"] = model.predict(X_valid)\n", + "probs_df[\"category\"] = valid_df.category\n", + "probs_df[\"texts\"] = valid_df.texts\n", + "probs_df[\"processed\"] = valid_df.processed\n", + "probs_df[\"wc\"] = valid_df.wc\n", + "probs_df[\"uwc\"] = valid_df.uwc\n", + "probs_df[\"hit\"] = (probs_df.preds==probs_df.category)\n", + "probs_df.to_csv(\"probs_df_linear.csv\", index=False)" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 77, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 }, + "colab_type": "code", + "id": "pa4Q0nPS6z54", + "outputId": "3e392b32-54c6-4fda-e819-af43e9ba0fd7" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "XGpprjbp6z5_", - "colab_type": "code", - "colab": {} - }, - "source": [ - "tt = Tokenizer(tok_func=ThaiTokenizer, lang=\"th\", pre_rules=pre_rules_th, post_rules=post_rules_th)\n", - "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", - " NumericalizeProcessor(vocab=None, max_vocab=60000, min_freq=2)]\n", - "\n", - "data_lm = (TextList.from_df(all_df, model_path, cols=\"texts\", processor=processor)\n", - " .split_by_rand_pct(valid_pct = 0.01, seed = 1412)\n", - " .label_for_lm()\n", - " .databunch(bs=48))\n", - "data_lm.sanity_check()\n", - "data_lm.save('wisesight_lm.pkl')" - ], - "execution_count": 0, - "outputs": [] + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7324099722991689\n" + ] }, { - "cell_type": "code", - "metadata": { - "id": "8GiTvaHX6z6A", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "outputId": "4ca15151-7db8-4bc8-d035-ddcbfc3383e1" - }, - "source": [ - "data_lm.sanity_check()\n", - "len(data_lm.train_ds), len(data_lm.valid_ds)" - ], - "execution_count": 32, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(23823, 240)" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 32 - } + "data": { + "image/png": "\n", + "text/plain": [ + "
" ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "\n", + "conf_mat = confusion_matrix(probs_df.category,probs_df.preds)\n", + "print(model.score(X_valid,y_valid))\n", + "sns.heatmap(conf_mat, annot=True, fmt=\"d\",\n", + " xticklabels=model.classes_, yticklabels=model.classes_)\n", + "plt.ylabel(\"Actual\")\n", + "plt.xlabel(\"Predicted\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "KIfXHEVn6z58" + }, + "source": [ + "## [ULMFit](https://github.com/cstorm125/thai2fit) Model" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "xPQoEbfC6z58" + }, + "outputs": [], + "source": [ + "from fastai.text import *\n", + "from fastai.callbacks import CSVLogger, SaveModelCallback\n", + "from pythainlp.ulmfit import *\n", + "\n", + "model_path = \"wisesight_data/\"\n", + "all_df = pd.read_csv(\"all_df.csv\")\n", + "train_df, valid_df = train_test_split(all_df, test_size=0.15, random_state=1412)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "RIUQEyb96z5-" + }, + "source": [ + "### Finetune Language Model" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "XGpprjbp6z5_" + }, + "outputs": [], + "source": [ + "tt = Tokenizer(tok_func=ThaiTokenizer, lang=\"th\", pre_rules=pre_rules_th, post_rules=post_rules_th)\n", + "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", + " NumericalizeProcessor(vocab=None, max_vocab=60000, min_freq=2)]\n", + "\n", + "data_lm = (TextList.from_df(all_df, model_path, cols=\"texts\", processor=processor)\n", + " .split_by_rand_pct(valid_pct = 0.01, seed = 1412)\n", + " .label_for_lm()\n", + " .databunch(bs=48))\n", + "data_lm.sanity_check()\n", + 
"data_lm.save('wisesight_lm.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 }, + "colab_type": "code", + "id": "8GiTvaHX6z6A", + "outputId": "4ca15151-7db8-4bc8-d035-ddcbfc3383e1" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "Bm7PYDIC6z6E", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "outputId": "f768001e-09ee-4572-ea48-ac9e38e7296a" - }, - "source": [ - "config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False, tie_weights=True, out_bias=True,\n", - " output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)\n", - "trn_args = dict(drop_mult=1., clip=0.12, alpha=2, beta=1)\n", - "\n", - "learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args)\n", - "\n", - "#load pretrained models\n", - "learn.load_pretrained(**_THWIKI_LSTM)" - ], - "execution_count": 33, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "LanguageLearner(data=TextLMDataBunch;\n", - "\n", - "Train: LabelList (23823 items)\n", - "x: LMTextList\n", - "xxbos ประเทศ เรา ผลิต และ ส่งออก ยาสูบ เยอะ สุด ใน โลก จิง ป่าว คับ,xxbos คะ,xxbos อิ เหี้ย ออม ทำ กู อยาก กิน เอ็ม เค,xxbos xxwrep 2 😅,xxbos สวัสดี วัน พุธ แนน อะไร นะ\n", - "y: LMLabelList\n", - ",,,,\n", - "Path: wisesight_data;\n", - "\n", - "Valid: LabelList (240 items)\n", - "x: LMTextList\n", - "xxbos เห็น คน ลบ แอ พ viu ก็ เห็นใจ และ เข้าใจ เขา นะคะ แผล มัน ยัง ใหม่ แถม อารมณ์ ยิ่ง โดน xxunk ง่าย อยู่ นี่ เนอะ 5 xxrep 7 ส่วน ทาง นี้ ก็ กอด netflix แน่น มาก เธอ อย่า ทำร้าย เรา นะ เรา รู้ เธอ ไม่ ทำร้าย เรา แน่นอน,xxbos ไป ชม ไม้ คิว ของ แชมป์ และ รอง แชมป์ กัน จ้า ! . xxrep 32 เก็บตก จาก การแข่งขัน แสงโสม สนุกเกอร์ 6 แดง โอเพ่น ประจำปี 2560 สนาม ที่ 2 ณ มัง กี้ สนุกเกอร์ คลับ ซอย โชค ชัย 4 ลาดพร้าว เมื่อ วันที่ 12 ต.ค. 
60,xxbos กลุ่ม รถ ซีวิค เป็น กลุ่ม ที่ น่า รำ คาน มาก xxrep 9 อวด รถ กัน ได้ ทุก วินาที อวด ทำไม มึง ก็ ใช้ รถ เหมือนกัน ทุกคน ละ ก็ พวก xxunk ที่ บอ กว่า อวด รถ แต่ ถ่าย นม ตัวเอง ชัด ละ รถ เบลอ นี่ คือ ? xxrep 5 ,xxbos อยาก สวย เหมือน เจ้าของ แบรนด์ สิ คะ เนย โชติ กา ใบหน้า สวย ใส xxunk แม้ แต่งหน้า นี่ ขนาด เป็น คุณแม่ แล้ว นะเนี่ย ก็ ยัง สวย ไม่ xxunk ผ่าน ไป กี่ ปี ๆ ก็ ไม่ เปลี่ยน ผิว ดี๊ ดี ความ สวย . เรา สร้าง เอง ได้ ด้วยตัวเอง ถ้า ได้ ใช้ มาส ์กโช ต้อง สวย เหมือน โชติ กา แน่นอน ค่ะ # มาส ์กโช สวย ข้ามคืน # cho _ cosmetics # daradaily # ดารา เดลี่,xxbos ข้าว โถ ละ ร้อย แพง เพราะ ตัก เป็น จาน ๆ ละ 15 เต็มที่ ก็ 5 จาน คนไทย ต้อง กินข้าว ประเทศ xxunk ข้าว กินข้าว ในประเทศ ตัวเอง หม้อ เป็น ร้อย เป็นลม ดีกว่า ค่า ฉะ xxunk ถุง 5 โล ไม่ เกิน 200 เป็น ข้าว มะลิ ไก่ นี่ ไม่รู้ ว่า เป็นตัว หรือเปล่า แต่ ถ้า ตัว ละ 250 บาท แพง ไก่ย่าง ขาย 140 - 160 มี เยอะแยะ ยัง ได้ กำไร แพง สุด ไม่ ควร xxunk 200 ข้าวผัด ปู จาน ใหญ่ 300 ร้อย แพง ถ้า ผัด เป็น จาน ๆ ละ 50 ผัด 4 จาน ก็ เต็ม ถาด ใหญ่ แล้ว ส่วน เครื่อง ดืม ด้านบน อะไร 80 ถ้า เป็น ชาเย็น แพง มาก น้ำ ดืม ขวด ใหญ่ ขวด ละ 50 แพง บ้าน เรา เมืองร้อน อย่า เห็นแก่ตัว 30 ก็ พอแล้ว คน ต้อง ซื้อ เยอะ เบียร์ ช้าง ขวด ละ 120 กำไร xxunk น่าเกลียด มา 3 ขวด 360 แพง xxunk ขวด ใหญ่ ก็ แพง แต่ น้ำแข็ง พอได้ เพราะ อากาศ บ้าน เรา ร้อน ละลาย ง่าย อันนี้ พอ เข้าใจ คน ขาย แต่ ทะเล เผา ทะเล ลวก ไม่เห็น หน้าตา ว่า มี อะไร บ้าง ก็ กุ้ง หมึก ปู xxunk 300 ตัว กลางๆ ใส่ มา อย่าง 5 ตัว ปู สัก ตัว กำไร xxunk ตำ ทะเล ตำ กุ้ง สด 150 ไม่ แพง กุ้ง ชุ ป แป้ง ทอด แพง มาก ๆ ต้มยำ หม้อ ละ 300 ร้อย ถือว่า แพง มาก เพราะ มัน ใส่ ได้ ไม่ เยอะ หรอก มันดี ตรง ที่ มี น้ำ กับ ไฟ อุ่น ร้อน ของ กินใน บ้าน ยัง แทบ แตะ ไม่ได้ ทั้งที่ ป ระ ก็ xxunk คน ในประเทศ ยัง กิน ไม่ อิม ส่งออก นอก พอ มี น้อย เหลือ น้อย ก็ ขาย ให้ กัน แพง ๆ ระบบ แย่ เอาเปรียบ กันเอง ที่ บ้าน ขาย อาหาร กับข้าว ตาม สั่ง ป รุ่ง สุก ใหม่ แค่ จาน ละ 100 - 150 คน ยัง ว่า แพง ทั้งที่ บอ กว่า อร่อย ถ้า ไป เจอ แบบนี้ สงสัย ช็อค ตาย คา ร้าน เลย มัง ถ้า ต้อง ไป เจอ ทั้ง แพง ทั้ง ไม่อร่อย สัก แต่ ทำ ขาย ใคร ว่า 
ไม่ แพง ยินดี ด้วย ที่ คุณ เป็น คน มี ตัง แต่ เรา มอง จาก ค่าแรง กลางๆ ของ คน ในประเทศ นะ ซึ่ง ส่วนมาก คนใน ประเทศไทย ได้ แต่ ค่าแรงขั้นต่ำ กับ เบี้ย ขยัน เล็กน้อย คนจน เยอะ กว่า คนรวย ด้วย ทำ อะไร ต้อง นึกถึง ความ สมควร นึกถึง กันและกัน แต่ เรา ไม่ใช่ คน xxunk ชอบ มอง และ คิด จาก ความเป็นจริง ถึง ขาย ให้ ชาวต่างชาติ ก็ เถอะ เหมือน คนไทย เอาเปรียบ ช่วย โอกาส และ ไม่ ค่อย xxunk\n", - "y: LMLabelList\n", - ",,,,\n", - "Path: wisesight_data;\n", - "\n", - "Test: None, model=SequentialRNN(\n", - " (0): AWD_LSTM(\n", - " (encoder): Embedding(15000, 400, padding_idx=1)\n", - " (encoder_dp): EmbeddingDropout(\n", - " (emb): Embedding(15000, 400, padding_idx=1)\n", - " )\n", - " (rnns): ModuleList(\n", - " (0): WeightDropout(\n", - " (module): LSTM(400, 1550, batch_first=True)\n", - " )\n", - " (1): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (2): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (3): WeightDropout(\n", - " (module): LSTM(1550, 400, batch_first=True)\n", - " )\n", - " )\n", - " (input_dp): RNNDropout()\n", - " (hidden_dps): ModuleList(\n", - " (0): RNNDropout()\n", - " (1): RNNDropout()\n", - " (2): RNNDropout()\n", - " (3): RNNDropout()\n", - " )\n", - " )\n", - " (1): LinearDecoder(\n", - " (decoder): Linear(in_features=400, out_features=15000, bias=True)\n", - " (output_dp): RNNDropout()\n", - " )\n", - "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('wisesight_data'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False), functools.partial(, clip=0.12)], callbacks=[RNNTrainer\n", - "learn: LanguageLearner(data=TextLMDataBunch;\n", - "\n", - "Train: LabelList (23823 items)\n", - "x: LMTextList\n", - "xxbos ประเทศ เรา ผลิต และ ส่งออก ยาสูบ เยอะ สุด ใน โลก จิง ป่าว คับ,xxbos คะ,xxbos อิ เหี้ย ออม ทำ กู อยาก กิน เอ็ม 
เค,xxbos xxwrep 2 😅,xxbos สวัสดี วัน พุธ แนน อะไร นะ\n", - "y: LMLabelList\n", - ",,,,\n", - "Path: wisesight_data;\n", - "\n", - "Valid: LabelList (240 items)\n", - "x: LMTextList\n", - "xxbos เห็น คน ลบ แอ พ viu ก็ เห็นใจ และ เข้าใจ เขา นะคะ แผล มัน ยัง ใหม่ แถม อารมณ์ ยิ่ง โดน xxunk ง่าย อยู่ นี่ เนอะ 5 xxrep 7 ส่วน ทาง นี้ ก็ กอด netflix แน่น มาก เธอ อย่า ทำร้าย เรา นะ เรา รู้ เธอ ไม่ ทำร้าย เรา แน่นอน,xxbos ไป ชม ไม้ คิว ของ แชมป์ และ รอง แชมป์ กัน จ้า ! . xxrep 32 เก็บตก จาก การแข่งขัน แสงโสม สนุกเกอร์ 6 แดง โอเพ่น ประจำปี 2560 สนาม ที่ 2 ณ มัง กี้ สนุกเกอร์ คลับ ซอย โชค ชัย 4 ลาดพร้าว เมื่อ วันที่ 12 ต.ค. 60,xxbos กลุ่ม รถ ซีวิค เป็น กลุ่ม ที่ น่า รำ คาน มาก xxrep 9 อวด รถ กัน ได้ ทุก วินาที อวด ทำไม มึง ก็ ใช้ รถ เหมือนกัน ทุกคน ละ ก็ พวก xxunk ที่ บอ กว่า อวด รถ แต่ ถ่าย นม ตัวเอง ชัด ละ รถ เบลอ นี่ คือ ? xxrep 5 ,xxbos อยาก สวย เหมือน เจ้าของ แบรนด์ สิ คะ เนย โชติ กา ใบหน้า สวย ใส xxunk แม้ แต่งหน้า นี่ ขนาด เป็น คุณแม่ แล้ว นะเนี่ย ก็ ยัง สวย ไม่ xxunk ผ่าน ไป กี่ ปี ๆ ก็ ไม่ เปลี่ยน ผิว ดี๊ ดี ความ สวย . 
เรา สร้าง เอง ได้ ด้วยตัวเอง ถ้า ได้ ใช้ มาส ์กโช ต้อง สวย เหมือน โชติ กา แน่นอน ค่ะ # มาส ์กโช สวย ข้ามคืน # cho _ cosmetics # daradaily # ดารา เดลี่,xxbos ข้าว โถ ละ ร้อย แพง เพราะ ตัก เป็น จาน ๆ ละ 15 เต็มที่ ก็ 5 จาน คนไทย ต้อง กินข้าว ประเทศ xxunk ข้าว กินข้าว ในประเทศ ตัวเอง หม้อ เป็น ร้อย เป็นลม ดีกว่า ค่า ฉะ xxunk ถุง 5 โล ไม่ เกิน 200 เป็น ข้าว มะลิ ไก่ นี่ ไม่รู้ ว่า เป็นตัว หรือเปล่า แต่ ถ้า ตัว ละ 250 บาท แพง ไก่ย่าง ขาย 140 - 160 มี เยอะแยะ ยัง ได้ กำไร แพง สุด ไม่ ควร xxunk 200 ข้าวผัด ปู จาน ใหญ่ 300 ร้อย แพง ถ้า ผัด เป็น จาน ๆ ละ 50 ผัด 4 จาน ก็ เต็ม ถาด ใหญ่ แล้ว ส่วน เครื่อง ดืม ด้านบน อะไร 80 ถ้า เป็น ชาเย็น แพง มาก น้ำ ดืม ขวด ใหญ่ ขวด ละ 50 แพง บ้าน เรา เมืองร้อน อย่า เห็นแก่ตัว 30 ก็ พอแล้ว คน ต้อง ซื้อ เยอะ เบียร์ ช้าง ขวด ละ 120 กำไร xxunk น่าเกลียด มา 3 ขวด 360 แพง xxunk ขวด ใหญ่ ก็ แพง แต่ น้ำแข็ง พอได้ เพราะ อากาศ บ้าน เรา ร้อน ละลาย ง่าย อันนี้ พอ เข้าใจ คน ขาย แต่ ทะเล เผา ทะเล ลวก ไม่เห็น หน้าตา ว่า มี อะไร บ้าง ก็ กุ้ง หมึก ปู xxunk 300 ตัว กลางๆ ใส่ มา อย่าง 5 ตัว ปู สัก ตัว กำไร xxunk ตำ ทะเล ตำ กุ้ง สด 150 ไม่ แพง กุ้ง ชุ ป แป้ง ทอด แพง มาก ๆ ต้มยำ หม้อ ละ 300 ร้อย ถือว่า แพง มาก เพราะ มัน ใส่ ได้ ไม่ เยอะ หรอก มันดี ตรง ที่ มี น้ำ กับ ไฟ อุ่น ร้อน ของ กินใน บ้าน ยัง แทบ แตะ ไม่ได้ ทั้งที่ ป ระ ก็ xxunk คน ในประเทศ ยัง กิน ไม่ อิม ส่งออก นอก พอ มี น้อย เหลือ น้อย ก็ ขาย ให้ กัน แพง ๆ ระบบ แย่ เอาเปรียบ กันเอง ที่ บ้าน ขาย อาหาร กับข้าว ตาม สั่ง ป รุ่ง สุก ใหม่ แค่ จาน ละ 100 - 150 คน ยัง ว่า แพง ทั้งที่ บอ กว่า อร่อย ถ้า ไป เจอ แบบนี้ สงสัย ช็อค ตาย คา ร้าน เลย มัง ถ้า ต้อง ไป เจอ ทั้ง แพง ทั้ง ไม่อร่อย สัก แต่ ทำ ขาย ใคร ว่า ไม่ แพง ยินดี ด้วย ที่ คุณ เป็น คน มี ตัง แต่ เรา มอง จาก ค่าแรง กลางๆ ของ คน ในประเทศ นะ ซึ่ง ส่วนมาก คนใน ประเทศไทย ได้ แต่ ค่าแรงขั้นต่ำ กับ เบี้ย ขยัน เล็กน้อย คนจน เยอะ กว่า คนรวย ด้วย ทำ อะไร ต้อง นึกถึง ความ สมควร นึกถึง กันและกัน แต่ เรา ไม่ใช่ คน xxunk ชอบ มอง และ คิด จาก ความเป็นจริง ถึง ขาย ให้ ชาวต่างชาติ ก็ เถอะ เหมือน คนไทย เอาเปรียบ ช่วย โอกาส และ ไม่ ค่อย xxunk\n", - "y: LMLabelList\n", - 
",,,,\n", - "Path: wisesight_data;\n", - "\n", - "Test: None, model=SequentialRNN(\n", - " (0): AWD_LSTM(\n", - " (encoder): Embedding(15000, 400, padding_idx=1)\n", - " (encoder_dp): EmbeddingDropout(\n", - " (emb): Embedding(15000, 400, padding_idx=1)\n", - " )\n", - " (rnns): ModuleList(\n", - " (0): WeightDropout(\n", - " (module): LSTM(400, 1550, batch_first=True)\n", - " )\n", - " (1): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (2): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (3): WeightDropout(\n", - " (module): LSTM(1550, 400, batch_first=True)\n", - " )\n", - " )\n", - " (input_dp): RNNDropout()\n", - " (hidden_dps): ModuleList(\n", - " (0): RNNDropout()\n", - " (1): RNNDropout()\n", - " (2): RNNDropout()\n", - " (3): RNNDropout()\n", - " )\n", - " )\n", - " (1): LinearDecoder(\n", - " (decoder): Linear(in_features=400, out_features=15000, bias=True)\n", - " (output_dp): RNNDropout()\n", - " )\n", - "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('wisesight_data'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False), functools.partial(, clip=0.12)], callbacks=[...], layer_groups=[Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(400, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 400, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): Embedding(15000, 400, padding_idx=1)\n", - " (1): 
EmbeddingDropout(\n", - " (emb): Embedding(15000, 400, padding_idx=1)\n", - " )\n", - " (2): LinearDecoder(\n", - " (decoder): Linear(in_features=400, out_features=15000, bias=True)\n", - " (output_dp): RNNDropout()\n", - " )\n", - ")], add_time=True, silent=False, cb_fns_registered=False)\n", - "alpha: 2\n", - "beta: 1], layer_groups=[Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(400, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 400, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): Embedding(15000, 400, padding_idx=1)\n", - " (1): EmbeddingDropout(\n", - " (emb): Embedding(15000, 400, padding_idx=1)\n", - " )\n", - " (2): LinearDecoder(\n", - " (decoder): Linear(in_features=400, out_features=15000, bias=True)\n", - " (output_dp): RNNDropout()\n", - " )\n", - ")], add_time=True, silent=False, cb_fns_registered=False)" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 33 - } + "data": { + "text/plain": [ + "(23823, 240)" ] + }, + "execution_count": 32, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "data_lm.sanity_check()\n", + "len(data_lm.train_ds), len(data_lm.valid_ds)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, + "colab_type": "code", + "id": "Bm7PYDIC6z6E", + "outputId": "f768001e-09ee-4572-ea48-ac9e38e7296a" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "uJK68vJT6z6G", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - 
"height": 97 - }, - "outputId": "9bde3724-568f-4630-afb7-df40b3aff0d4" - }, - "source": [ - "#train frozen\n", - "print(\"training frozen\")\n", - "learn.freeze_to(-1)\n", - "learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))" - ], - "execution_count": 34, - "outputs": [ - { - "output_type": "stream", - "text": [ - "training frozen\n" - ], - "name": "stdout" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
04.8411874.4627140.31974202:47
" - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - } + "data": { + "text/plain": [ + "LanguageLearner(data=TextLMDataBunch;\n", + "\n", + "Train: LabelList (23823 items)\n", + "x: LMTextList\n", + "xxbos ประเทศ เรา ผลิต และ ส่งออก ยาสูบ เยอะ สุด ใน โลก จิง ป่าว คับ,xxbos คะ,xxbos อิ เหี้ย ออม ทำ กู อยาก กิน เอ็ม เค,xxbos xxwrep 2 😅,xxbos สวัสดี วัน พุธ แนน อะไร นะ\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: wisesight_data;\n", + "\n", + "Valid: LabelList (240 items)\n", + "x: LMTextList\n", + "xxbos เห็น คน ลบ แอ พ viu ก็ เห็นใจ และ เข้าใจ เขา นะคะ แผล มัน ยัง ใหม่ แถม อารมณ์ ยิ่ง โดน xxunk ง่าย อยู่ นี่ เนอะ 5 xxrep 7 ส่วน ทาง นี้ ก็ กอด netflix แน่น มาก เธอ อย่า ทำร้าย เรา นะ เรา รู้ เธอ ไม่ ทำร้าย เรา แน่นอน,xxbos ไป ชม ไม้ คิว ของ แชมป์ และ รอง แชมป์ กัน จ้า ! . xxrep 32 เก็บตก จาก การแข่งขัน แสงโสม สนุกเกอร์ 6 แดง โอเพ่น ประจำปี 2560 สนาม ที่ 2 ณ มัง กี้ สนุกเกอร์ คลับ ซอย โชค ชัย 4 ลาดพร้าว เมื่อ วันที่ 12 ต.ค. 60,xxbos กลุ่ม รถ ซีวิค เป็น กลุ่ม ที่ น่า รำ คาน มาก xxrep 9 อวด รถ กัน ได้ ทุก วินาที อวด ทำไม มึง ก็ ใช้ รถ เหมือนกัน ทุกคน ละ ก็ พวก xxunk ที่ บอ กว่า อวด รถ แต่ ถ่าย นม ตัวเอง ชัด ละ รถ เบลอ นี่ คือ ? xxrep 5 ,xxbos อยาก สวย เหมือน เจ้าของ แบรนด์ สิ คะ เนย โชติ กา ใบหน้า สวย ใส xxunk แม้ แต่งหน้า นี่ ขนาด เป็น คุณแม่ แล้ว นะเนี่ย ก็ ยัง สวย ไม่ xxunk ผ่าน ไป กี่ ปี ๆ ก็ ไม่ เปลี่ยน ผิว ดี๊ ดี ความ สวย . 
เรา สร้าง เอง ได้ ด้วยตัวเอง ถ้า ได้ ใช้ มาส ์กโช ต้อง สวย เหมือน โชติ กา แน่นอน ค่ะ # มาส ์กโช สวย ข้ามคืน # cho _ cosmetics # daradaily # ดารา เดลี่,xxbos ข้าว โถ ละ ร้อย แพง เพราะ ตัก เป็น จาน ๆ ละ 15 เต็มที่ ก็ 5 จาน คนไทย ต้อง กินข้าว ประเทศ xxunk ข้าว กินข้าว ในประเทศ ตัวเอง หม้อ เป็น ร้อย เป็นลม ดีกว่า ค่า ฉะ xxunk ถุง 5 โล ไม่ เกิน 200 เป็น ข้าว มะลิ ไก่ นี่ ไม่รู้ ว่า เป็นตัว หรือเปล่า แต่ ถ้า ตัว ละ 250 บาท แพง ไก่ย่าง ขาย 140 - 160 มี เยอะแยะ ยัง ได้ กำไร แพง สุด ไม่ ควร xxunk 200 ข้าวผัด ปู จาน ใหญ่ 300 ร้อย แพง ถ้า ผัด เป็น จาน ๆ ละ 50 ผัด 4 จาน ก็ เต็ม ถาด ใหญ่ แล้ว ส่วน เครื่อง ดืม ด้านบน อะไร 80 ถ้า เป็น ชาเย็น แพง มาก น้ำ ดืม ขวด ใหญ่ ขวด ละ 50 แพง บ้าน เรา เมืองร้อน อย่า เห็นแก่ตัว 30 ก็ พอแล้ว คน ต้อง ซื้อ เยอะ เบียร์ ช้าง ขวด ละ 120 กำไร xxunk น่าเกลียด มา 3 ขวด 360 แพง xxunk ขวด ใหญ่ ก็ แพง แต่ น้ำแข็ง พอได้ เพราะ อากาศ บ้าน เรา ร้อน ละลาย ง่าย อันนี้ พอ เข้าใจ คน ขาย แต่ ทะเล เผา ทะเล ลวก ไม่เห็น หน้าตา ว่า มี อะไร บ้าง ก็ กุ้ง หมึก ปู xxunk 300 ตัว กลางๆ ใส่ มา อย่าง 5 ตัว ปู สัก ตัว กำไร xxunk ตำ ทะเล ตำ กุ้ง สด 150 ไม่ แพง กุ้ง ชุ ป แป้ง ทอด แพง มาก ๆ ต้มยำ หม้อ ละ 300 ร้อย ถือว่า แพง มาก เพราะ มัน ใส่ ได้ ไม่ เยอะ หรอก มันดี ตรง ที่ มี น้ำ กับ ไฟ อุ่น ร้อน ของ กินใน บ้าน ยัง แทบ แตะ ไม่ได้ ทั้งที่ ป ระ ก็ xxunk คน ในประเทศ ยัง กิน ไม่ อิม ส่งออก นอก พอ มี น้อย เหลือ น้อย ก็ ขาย ให้ กัน แพง ๆ ระบบ แย่ เอาเปรียบ กันเอง ที่ บ้าน ขาย อาหาร กับข้าว ตาม สั่ง ป รุ่ง สุก ใหม่ แค่ จาน ละ 100 - 150 คน ยัง ว่า แพง ทั้งที่ บอ กว่า อร่อย ถ้า ไป เจอ แบบนี้ สงสัย ช็อค ตาย คา ร้าน เลย มัง ถ้า ต้อง ไป เจอ ทั้ง แพง ทั้ง ไม่อร่อย สัก แต่ ทำ ขาย ใคร ว่า ไม่ แพง ยินดี ด้วย ที่ คุณ เป็น คน มี ตัง แต่ เรา มอง จาก ค่าแรง กลางๆ ของ คน ในประเทศ นะ ซึ่ง ส่วนมาก คนใน ประเทศไทย ได้ แต่ ค่าแรงขั้นต่ำ กับ เบี้ย ขยัน เล็กน้อย คนจน เยอะ กว่า คนรวย ด้วย ทำ อะไร ต้อง นึกถึง ความ สมควร นึกถึง กันและกัน แต่ เรา ไม่ใช่ คน xxunk ชอบ มอง และ คิด จาก ความเป็นจริง ถึง ขาย ให้ ชาวต่างชาติ ก็ เถอะ เหมือน คนไทย เอาเปรียบ ช่วย โอกาส และ ไม่ ค่อย xxunk\n", + "y: LMLabelList\n", + 
",,,,\n", + "Path: wisesight_data;\n", + "\n", + "Test: None, model=SequentialRNN(\n", + " (0): AWD_LSTM(\n", + " (encoder): Embedding(15000, 400, padding_idx=1)\n", + " (encoder_dp): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + " (rnns): ModuleList(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (2): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (3): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " )\n", + " (input_dp): RNNDropout()\n", + " (hidden_dps): ModuleList(\n", + " (0): RNNDropout()\n", + " (1): RNNDropout()\n", + " (2): RNNDropout()\n", + " (3): RNNDropout()\n", + " )\n", + " )\n", + " (1): LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=15000, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('wisesight_data'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False), functools.partial(, clip=0.12)], callbacks=[RNNTrainer\n", + "learn: LanguageLearner(data=TextLMDataBunch;\n", + "\n", + "Train: LabelList (23823 items)\n", + "x: LMTextList\n", + "xxbos ประเทศ เรา ผลิต และ ส่งออก ยาสูบ เยอะ สุด ใน โลก จิง ป่าว คับ,xxbos คะ,xxbos อิ เหี้ย ออม ทำ กู อยาก กิน เอ็ม เค,xxbos xxwrep 2 😅,xxbos สวัสดี วัน พุธ แนน อะไร นะ\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: wisesight_data;\n", + "\n", + "Valid: LabelList (240 items)\n", + "x: LMTextList\n", + "xxbos เห็น คน ลบ แอ พ viu ก็ เห็นใจ และ เข้าใจ เขา นะคะ แผล มัน ยัง ใหม่ แถม อารมณ์ ยิ่ง โดน xxunk ง่าย อยู่ นี่ เนอะ 5 xxrep 7 ส่วน ทาง นี้ ก็ กอด netflix แน่น มาก เธอ อย่า ทำร้าย เรา นะ เรา รู้ เธอ ไม่ ทำร้าย เรา แน่นอน,xxbos 
ไป ชม ไม้ คิว ของ แชมป์ และ รอง แชมป์ กัน จ้า ! . xxrep 32 เก็บตก จาก การแข่งขัน แสงโสม สนุกเกอร์ 6 แดง โอเพ่น ประจำปี 2560 สนาม ที่ 2 ณ มัง กี้ สนุกเกอร์ คลับ ซอย โชค ชัย 4 ลาดพร้าว เมื่อ วันที่ 12 ต.ค. 60,xxbos กลุ่ม รถ ซีวิค เป็น กลุ่ม ที่ น่า รำ คาน มาก xxrep 9 อวด รถ กัน ได้ ทุก วินาที อวด ทำไม มึง ก็ ใช้ รถ เหมือนกัน ทุกคน ละ ก็ พวก xxunk ที่ บอ กว่า อวด รถ แต่ ถ่าย นม ตัวเอง ชัด ละ รถ เบลอ นี่ คือ ? xxrep 5 ,xxbos อยาก สวย เหมือน เจ้าของ แบรนด์ สิ คะ เนย โชติ กา ใบหน้า สวย ใส xxunk แม้ แต่งหน้า นี่ ขนาด เป็น คุณแม่ แล้ว นะเนี่ย ก็ ยัง สวย ไม่ xxunk ผ่าน ไป กี่ ปี ๆ ก็ ไม่ เปลี่ยน ผิว ดี๊ ดี ความ สวย . เรา สร้าง เอง ได้ ด้วยตัวเอง ถ้า ได้ ใช้ มาส ์กโช ต้อง สวย เหมือน โชติ กา แน่นอน ค่ะ # มาส ์กโช สวย ข้ามคืน # cho _ cosmetics # daradaily # ดารา เดลี่,xxbos ข้าว โถ ละ ร้อย แพง เพราะ ตัก เป็น จาน ๆ ละ 15 เต็มที่ ก็ 5 จาน คนไทย ต้อง กินข้าว ประเทศ xxunk ข้าว กินข้าว ในประเทศ ตัวเอง หม้อ เป็น ร้อย เป็นลม ดีกว่า ค่า ฉะ xxunk ถุง 5 โล ไม่ เกิน 200 เป็น ข้าว มะลิ ไก่ นี่ ไม่รู้ ว่า เป็นตัว หรือเปล่า แต่ ถ้า ตัว ละ 250 บาท แพง ไก่ย่าง ขาย 140 - 160 มี เยอะแยะ ยัง ได้ กำไร แพง สุด ไม่ ควร xxunk 200 ข้าวผัด ปู จาน ใหญ่ 300 ร้อย แพง ถ้า ผัด เป็น จาน ๆ ละ 50 ผัด 4 จาน ก็ เต็ม ถาด ใหญ่ แล้ว ส่วน เครื่อง ดืม ด้านบน อะไร 80 ถ้า เป็น ชาเย็น แพง มาก น้ำ ดืม ขวด ใหญ่ ขวด ละ 50 แพง บ้าน เรา เมืองร้อน อย่า เห็นแก่ตัว 30 ก็ พอแล้ว คน ต้อง ซื้อ เยอะ เบียร์ ช้าง ขวด ละ 120 กำไร xxunk น่าเกลียด มา 3 ขวด 360 แพง xxunk ขวด ใหญ่ ก็ แพง แต่ น้ำแข็ง พอได้ เพราะ อากาศ บ้าน เรา ร้อน ละลาย ง่าย อันนี้ พอ เข้าใจ คน ขาย แต่ ทะเล เผา ทะเล ลวก ไม่เห็น หน้าตา ว่า มี อะไร บ้าง ก็ กุ้ง หมึก ปู xxunk 300 ตัว กลางๆ ใส่ มา อย่าง 5 ตัว ปู สัก ตัว กำไร xxunk ตำ ทะเล ตำ กุ้ง สด 150 ไม่ แพง กุ้ง ชุ ป แป้ง ทอด แพง มาก ๆ ต้มยำ หม้อ ละ 300 ร้อย ถือว่า แพง มาก เพราะ มัน ใส่ ได้ ไม่ เยอะ หรอก มันดี ตรง ที่ มี น้ำ กับ ไฟ อุ่น ร้อน ของ กินใน บ้าน ยัง แทบ แตะ ไม่ได้ ทั้งที่ ป ระ ก็ xxunk คน ในประเทศ ยัง กิน ไม่ อิม ส่งออก นอก พอ มี น้อย เหลือ น้อย ก็ ขาย ให้ กัน แพง ๆ ระบบ แย่ เอาเปรียบ กันเอง ที่ บ้าน ขาย อาหาร 
กับข้าว ตาม สั่ง ป รุ่ง สุก ใหม่ แค่ จาน ละ 100 - 150 คน ยัง ว่า แพง ทั้งที่ บอ กว่า อร่อย ถ้า ไป เจอ แบบนี้ สงสัย ช็อค ตาย คา ร้าน เลย มัง ถ้า ต้อง ไป เจอ ทั้ง แพง ทั้ง ไม่อร่อย สัก แต่ ทำ ขาย ใคร ว่า ไม่ แพง ยินดี ด้วย ที่ คุณ เป็น คน มี ตัง แต่ เรา มอง จาก ค่าแรง กลางๆ ของ คน ในประเทศ นะ ซึ่ง ส่วนมาก คนใน ประเทศไทย ได้ แต่ ค่าแรงขั้นต่ำ กับ เบี้ย ขยัน เล็กน้อย คนจน เยอะ กว่า คนรวย ด้วย ทำ อะไร ต้อง นึกถึง ความ สมควร นึกถึง กันและกัน แต่ เรา ไม่ใช่ คน xxunk ชอบ มอง และ คิด จาก ความเป็นจริง ถึง ขาย ให้ ชาวต่างชาติ ก็ เถอะ เหมือน คนไทย เอาเปรียบ ช่วย โอกาส และ ไม่ ค่อย xxunk\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: wisesight_data;\n", + "\n", + "Test: None, model=SequentialRNN(\n", + " (0): AWD_LSTM(\n", + " (encoder): Embedding(15000, 400, padding_idx=1)\n", + " (encoder_dp): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + " (rnns): ModuleList(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (2): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (3): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " )\n", + " (input_dp): RNNDropout()\n", + " (hidden_dps): ModuleList(\n", + " (0): RNNDropout()\n", + " (1): RNNDropout()\n", + " (2): RNNDropout()\n", + " (3): RNNDropout()\n", + " )\n", + " )\n", + " (1): LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=15000, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('wisesight_data'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False), functools.partial(, clip=0.12)], callbacks=[...], layer_groups=[Sequential(\n", + " (0): 
WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): Embedding(15000, 400, padding_idx=1)\n", + " (1): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + " (2): LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=15000, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + ")], add_time=True, silent=False, cb_fns_registered=False)\n", + "alpha: 2\n", + "beta: 1], layer_groups=[Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): Embedding(15000, 400, padding_idx=1)\n", + " (1): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + " (2): LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=15000, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + ")], add_time=True, silent=False, cb_fns_registered=False)" ] + }, + "execution_count": 33, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "config = 
dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False, tie_weights=True, out_bias=True,\n", + " output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)\n", + "trn_args = dict(drop_mult=1., clip=0.12, alpha=2, beta=1)\n", + "\n", + "learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args)\n", + "\n", + "#load pretrained models\n", + "learn.load_pretrained(**_THWIKI_LSTM)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 97 }, + "colab_type": "code", + "id": "uJK68vJT6z6G", + "outputId": "9bde3724-568f-4630-afb7-df40b3aff0d4" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "axooWmsg6z6I", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 221 - }, - "outputId": "819107f9-5d37-49ef-dac6-7cde4204656b" - }, - "source": [ - "#train unfrozen\n", - "print(\"training unfrozen\")\n", - "learn.unfreeze()\n", - "learn.fit_one_cycle(5, 1e-3, moms=(0.8, 0.7))" - ], - "execution_count": 35, - "outputs": [ - { - "output_type": "stream", - "text": [ - "training unfrozen\n" - ], - "name": "stdout" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
04.4118344.2055520.34176603:31
14.1780304.0370950.36150803:31
23.9703883.9309190.37013903:31
33.7561903.8903980.37619103:31
43.6717043.8902320.37559503:31
" - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "training frozen\n" + ] }, { - "cell_type": "code", - "metadata": { - "id": "OZC4BGnB6z6L", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# learn.save('wisesight_lm')\n", - "learn.save_encoder(\"wisesight_enc\")" + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_lossaccuracytime
04.8411874.4627140.31974202:47
" ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hTTQ76Ls6z6N", - "colab_type": "text" - }, - "source": [ - "### Train Text Classifier" + "text/plain": [ + "" ] + }, + "metadata": { + "tags": [] + }, + "output_type": "display_data" + } + ], + "source": [ + "#train frozen\n", + "print(\"training frozen\")\n", + "learn.freeze_to(-1)\n", + "learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 }, + "colab_type": "code", + "id": "axooWmsg6z6I", + "outputId": "819107f9-5d37-49ef-dac6-7cde4204656b" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "A2Z09Mf26z6N", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "outputId": "27cfec82-23e3-4e0b-dd0d-7b4ba855e646" - }, - "source": [ - "#lm data\n", - "data_lm = load_data(model_path, \"wisesight_lm.pkl\")\n", - "data_lm.sanity_check()\n", - "\n", - "#classification data\n", - "tt = Tokenizer(tok_func=ThaiTokenizer, lang=\"th\", pre_rules=pre_rules_th, post_rules=post_rules_th)\n", - "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", - " NumericalizeProcessor(vocab=data_lm.vocab, max_vocab=60000, min_freq=20)]\n", - "\n", - "data_cls = (ItemLists(model_path,train=TextList.from_df(train_df, model_path, cols=[\"texts\"], processor=processor),\n", - " valid=TextList.from_df(valid_df, model_path, cols=[\"texts\"], processor=processor))\n", - " .label_from_df(\"category\")\n", - " .databunch(bs=50)\n", - " )\n", - "data_cls.sanity_check()\n", - "print(len(data_cls.vocab.itos))" - ], - "execution_count": 39, - "outputs": [ - { - "output_type": "stream", - "text": [ - "15000\n" - ], - "name": "stdout" - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "training unfrozen\n" + ] }, { - "cell_type": "code", - "metadata": 
{ - "id": "RjRFWx8-6z6P", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "outputId": "76a4bb55-a3cd-4d51-d5fc-6c2fec878573" - }, - "source": [ - "#model\n", - "config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False,\n", - " output_p=0.4, hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5)\n", - "trn_args = dict(bptt=70, drop_mult=0.7, alpha=2, beta=1, max_len=500)\n", - "\n", - "learn = text_classifier_learner(data_cls, AWD_LSTM, config=config, pretrained=False, **trn_args)\n", - "#load pretrained finetuned model\n", - "learn.load_encoder(\"wisesight_enc\")" + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_lossaccuracytime
04.4118344.2055520.34176603:31
14.1780304.0370950.36150803:31
23.9703883.9309190.37013903:31
33.7561903.8903980.37619103:31
43.6717043.8902320.37559503:31
" ], - "execution_count": 40, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "RNNLearner(data=TextClasDataBunch;\n", - "\n", - "Train: LabelList (20453 items)\n", - "x: TextList\n", - "xxbos กันแดด คิว เพลส ตัวใหม่ นี่ คุม มัน ดีจริง อ่ะ นี่ หน้า มัน ยิ่ง ที โซน ยิ่ง มัน เยอะ นีเวีย หลอด ยาว ๆ ฝา เขียว ก็ เอา ไม่อยู่ อ่ะ แล้ว xxunk,xxbos พบ กับ การ ร่วม ตัว ของ ศิลปิน soul pop สาม ยุค สาม สไตล์ ใน งาน jamnight อะไร ก็ช่าง xxunk ( ชุ่ย ) นำ ทีม โดย soul after six , the parkinson และ the xxup toys งาน นี้ นอกจาก จะ ได้ ดู โชว์ แบบ เต็ม รูปแบบ จาก ทั้ง สาม วง แล้ว ยัง มี โชว์ สุด พิเศษ ที่ ทั้ง สาม จะ ร่วม แจม กัน ด้วย ไม่ อยาก ให้ พลาด เจอกัน วันที่ 29 กันยายน นี้ ที่ ช่าง ชุ่ย ประตู เปิด 19.00 น. เป็นต้นไป สามารถ ซื้อ บัตร ได้ แล้ว ที่ event pop : http : / / go . eventpop . me / jamnight * จำกัด ผู้ ที่ มีอายุ 20 ปี ขึ้นไป # jamnightbyjameson # jamesonthailand # soulaftersix # theparkinson # thetoys,xxbos 👌 🏻 👌 🏻 👌 🏻 xxwrep 2 😆,xxbos จ - ศ แถม ถึง 29 ไม่ทัน มะ ว้า xxrep 4 ,xxbos ใช้ ดี ค่ะ บอกต่อ คือ เป็น คน แพ้ ง่าย มาก กก ใช้ กา นิ เย หรือ พอน ก็ แพ้ แต่ ใช้ ครีม แตงโม แล้ว คือ ดี สิว ลด กันน้ำ ค่ะ นี้ ใช้ ไป เล่น สงกรานต์ มา รอด ค่ะ 555 เล่อ ค่า xxrep 5 \n", - "y: CategoryList\n", - "neg,neu,neu,neu,neg\n", - "Path: wisesight_data;\n", - "\n", - "Valid: LabelList (3610 items)\n", - "x: TextList\n", - "xxbos เห็น คน ลบ แอ พ viu ก็ เห็นใจ และ เข้าใจ เขา นะคะ แผล มัน ยัง ใหม่ แถม อารมณ์ ยิ่ง โดน xxunk ง่าย อยู่ นี่ เนอะ 5 xxrep 7 ส่วน ทาง นี้ ก็ กอด netflix แน่น มาก เธอ อย่า ทำร้าย เรา นะ เรา รู้ เธอ ไม่ ทำร้าย เรา แน่นอน,xxbos ไป ชม ไม้ คิว ของ แชมป์ และ รอง แชมป์ กัน จ้า ! . xxrep 32 เก็บตก จาก การแข่งขัน แสงโสม สนุกเกอร์ 6 แดง โอเพ่น ประจำปี 2560 สนาม ที่ 2 ณ มัง กี้ สนุกเกอร์ คลับ ซอย โชค ชัย 4 ลาดพร้าว เมื่อ วันที่ 12 ต.ค. 
60,xxbos กลุ่ม รถ ซีวิค เป็น กลุ่ม ที่ น่า รำ คาน มาก xxrep 9 อวด รถ กัน ได้ ทุก วินาที อวด ทำไม มึง ก็ ใช้ รถ เหมือนกัน ทุกคน ละ ก็ พวก xxunk ที่ บอ กว่า อวด รถ แต่ ถ่าย นม ตัวเอง ชัด ละ รถ เบลอ นี่ คือ ? xxrep 5 ,xxbos อยาก สวย เหมือน เจ้าของ แบรนด์ สิ คะ เนย โชติ กา ใบหน้า สวย ใส xxunk แม้ แต่งหน้า นี่ ขนาด เป็น คุณแม่ แล้ว นะเนี่ย ก็ ยัง สวย ไม่ xxunk ผ่าน ไป กี่ ปี ๆ ก็ ไม่ เปลี่ยน ผิว ดี๊ ดี ความ สวย . เรา สร้าง เอง ได้ ด้วยตัวเอง ถ้า ได้ ใช้ มาส ์กโช ต้อง สวย เหมือน โชติ กา แน่นอน ค่ะ # มาส ์กโช สวย ข้ามคืน # cho _ cosmetics # daradaily # ดารา เดลี่,xxbos ข้าว โถ ละ ร้อย แพง เพราะ ตัก เป็น จาน ๆ ละ 15 เต็มที่ ก็ 5 จาน คนไทย ต้อง กินข้าว ประเทศ xxunk ข้าว กินข้าว ในประเทศ ตัวเอง หม้อ เป็น ร้อย เป็นลม ดีกว่า ค่า ฉะ xxunk ถุง 5 โล ไม่ เกิน 200 เป็น ข้าว มะลิ ไก่ นี่ ไม่รู้ ว่า เป็นตัว หรือเปล่า แต่ ถ้า ตัว ละ 250 บาท แพง ไก่ย่าง ขาย 140 - 160 มี เยอะแยะ ยัง ได้ กำไร แพง สุด ไม่ ควร xxunk 200 ข้าวผัด ปู จาน ใหญ่ 300 ร้อย แพง ถ้า ผัด เป็น จาน ๆ ละ 50 ผัด 4 จาน ก็ เต็ม ถาด ใหญ่ แล้ว ส่วน เครื่อง ดืม ด้านบน อะไร 80 ถ้า เป็น ชาเย็น แพง มาก น้ำ ดืม ขวด ใหญ่ ขวด ละ 50 แพง บ้าน เรา เมืองร้อน อย่า เห็นแก่ตัว 30 ก็ พอแล้ว คน ต้อง ซื้อ เยอะ เบียร์ ช้าง ขวด ละ 120 กำไร xxunk น่าเกลียด มา 3 ขวด 360 แพง xxunk ขวด ใหญ่ ก็ แพง แต่ น้ำแข็ง พอได้ เพราะ อากาศ บ้าน เรา ร้อน ละลาย ง่าย อันนี้ พอ เข้าใจ คน ขาย แต่ ทะเล เผา ทะเล ลวก ไม่เห็น หน้าตา ว่า มี อะไร บ้าง ก็ กุ้ง หมึก ปู xxunk 300 ตัว กลางๆ ใส่ มา อย่าง 5 ตัว ปู สัก ตัว กำไร xxunk ตำ ทะเล ตำ กุ้ง สด 150 ไม่ แพง กุ้ง ชุ ป แป้ง ทอด แพง มาก ๆ ต้มยำ หม้อ ละ 300 ร้อย ถือว่า แพง มาก เพราะ มัน ใส่ ได้ ไม่ เยอะ หรอก มันดี ตรง ที่ มี น้ำ กับ ไฟ อุ่น ร้อน ของ กินใน บ้าน ยัง แทบ แตะ ไม่ได้ ทั้งที่ ป ระ ก็ xxunk คน ในประเทศ ยัง กิน ไม่ อิม ส่งออก นอก พอ มี น้อย เหลือ น้อย ก็ ขาย ให้ กัน แพง ๆ ระบบ แย่ เอาเปรียบ กันเอง ที่ บ้าน ขาย อาหาร กับข้าว ตาม สั่ง ป รุ่ง สุก ใหม่ แค่ จาน ละ 100 - 150 คน ยัง ว่า แพง ทั้งที่ บอ กว่า อร่อย ถ้า ไป เจอ แบบนี้ สงสัย ช็อค ตาย คา ร้าน เลย มัง ถ้า ต้อง ไป เจอ ทั้ง แพง ทั้ง ไม่อร่อย สัก แต่ ทำ ขาย ใคร ว่า 
ไม่ แพง ยินดี ด้วย ที่ คุณ เป็น คน มี ตัง แต่ เรา มอง จาก ค่าแรง กลางๆ ของ คน ในประเทศ นะ ซึ่ง ส่วนมาก คนใน ประเทศไทย ได้ แต่ ค่าแรงขั้นต่ำ กับ เบี้ย ขยัน เล็กน้อย คนจน เยอะ กว่า คนรวย ด้วย ทำ อะไร ต้อง นึกถึง ความ สมควร นึกถึง กันและกัน แต่ เรา ไม่ใช่ คน xxunk ชอบ มอง และ คิด จาก ความเป็นจริง ถึง ขาย ให้ ชาวต่างชาติ ก็ เถอะ เหมือน คนไทย เอาเปรียบ ช่วย โอกาส และ ไม่ ค่อย xxunk\n", - "y: CategoryList\n", - "neu,neu,neg,neu,neg\n", - "Path: wisesight_data;\n", - "\n", - "Test: None, model=SequentialRNN(\n", - " (0): MultiBatchEncoder(\n", - " (module): AWD_LSTM(\n", - " (encoder): Embedding(15000, 400, padding_idx=1)\n", - " (encoder_dp): EmbeddingDropout(\n", - " (emb): Embedding(15000, 400, padding_idx=1)\n", - " )\n", - " (rnns): ModuleList(\n", - " (0): WeightDropout(\n", - " (module): LSTM(400, 1550, batch_first=True)\n", - " )\n", - " (1): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (2): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (3): WeightDropout(\n", - " (module): LSTM(1550, 400, batch_first=True)\n", - " )\n", - " )\n", - " (input_dp): RNNDropout()\n", - " (hidden_dps): ModuleList(\n", - " (0): RNNDropout()\n", - " (1): RNNDropout()\n", - " (2): RNNDropout()\n", - " (3): RNNDropout()\n", - " )\n", - " )\n", - " )\n", - " (1): PoolingLinearClassifier(\n", - " (layers): Sequential(\n", - " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (1): Dropout(p=0.27999999999999997)\n", - " (2): Linear(in_features=1200, out_features=50, bias=True)\n", - " (3): ReLU(inplace)\n", - " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (5): Dropout(p=0.1)\n", - " (6): Linear(in_features=50, out_features=4, bias=True)\n", - " )\n", - " )\n", - "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, 
path=PosixPath('wisesight_data'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[RNNTrainer\n", - "learn: RNNLearner(data=TextClasDataBunch;\n", - "\n", - "Train: LabelList (20453 items)\n", - "x: TextList\n", - "xxbos กันแดด คิว เพลส ตัวใหม่ นี่ คุม มัน ดีจริง อ่ะ นี่ หน้า มัน ยิ่ง ที โซน ยิ่ง มัน เยอะ นีเวีย หลอด ยาว ๆ ฝา เขียว ก็ เอา ไม่อยู่ อ่ะ แล้ว xxunk,xxbos พบ กับ การ ร่วม ตัว ของ ศิลปิน soul pop สาม ยุค สาม สไตล์ ใน งาน jamnight อะไร ก็ช่าง xxunk ( ชุ่ย ) นำ ทีม โดย soul after six , the parkinson และ the xxup toys งาน นี้ นอกจาก จะ ได้ ดู โชว์ แบบ เต็ม รูปแบบ จาก ทั้ง สาม วง แล้ว ยัง มี โชว์ สุด พิเศษ ที่ ทั้ง สาม จะ ร่วม แจม กัน ด้วย ไม่ อยาก ให้ พลาด เจอกัน วันที่ 29 กันยายน นี้ ที่ ช่าง ชุ่ย ประตู เปิด 19.00 น. เป็นต้นไป สามารถ ซื้อ บัตร ได้ แล้ว ที่ event pop : http : / / go . eventpop . me / jamnight * จำกัด ผู้ ที่ มีอายุ 20 ปี ขึ้นไป # jamnightbyjameson # jamesonthailand # soulaftersix # theparkinson # thetoys,xxbos 👌 🏻 👌 🏻 👌 🏻 xxwrep 2 😆,xxbos จ - ศ แถม ถึง 29 ไม่ทัน มะ ว้า xxrep 4 ,xxbos ใช้ ดี ค่ะ บอกต่อ คือ เป็น คน แพ้ ง่าย มาก กก ใช้ กา นิ เย หรือ พอน ก็ แพ้ แต่ ใช้ ครีม แตงโม แล้ว คือ ดี สิว ลด กันน้ำ ค่ะ นี้ ใช้ ไป เล่น สงกรานต์ มา รอด ค่ะ 555 เล่อ ค่า xxrep 5 \n", - "y: CategoryList\n", - "neg,neu,neu,neu,neg\n", - "Path: wisesight_data;\n", - "\n", - "Valid: LabelList (3610 items)\n", - "x: TextList\n", - "xxbos เห็น คน ลบ แอ พ viu ก็ เห็นใจ และ เข้าใจ เขา นะคะ แผล มัน ยัง ใหม่ แถม อารมณ์ ยิ่ง โดน xxunk ง่าย อยู่ นี่ เนอะ 5 xxrep 7 ส่วน ทาง นี้ ก็ กอด netflix แน่น มาก เธอ อย่า ทำร้าย เรา นะ เรา รู้ เธอ ไม่ ทำร้าย เรา แน่นอน,xxbos ไป ชม ไม้ คิว ของ แชมป์ และ รอง แชมป์ กัน จ้า ! . xxrep 32 เก็บตก จาก การแข่งขัน แสงโสม สนุกเกอร์ 6 แดง โอเพ่น ประจำปี 2560 สนาม ที่ 2 ณ มัง กี้ สนุกเกอร์ คลับ ซอย โชค ชัย 4 ลาดพร้าว เมื่อ วันที่ 12 ต.ค. 
60,xxbos กลุ่ม รถ ซีวิค เป็น กลุ่ม ที่ น่า รำ คาน มาก xxrep 9 อวด รถ กัน ได้ ทุก วินาที อวด ทำไม มึง ก็ ใช้ รถ เหมือนกัน ทุกคน ละ ก็ พวก xxunk ที่ บอ กว่า อวด รถ แต่ ถ่าย นม ตัวเอง ชัด ละ รถ เบลอ นี่ คือ ? xxrep 5 ,xxbos อยาก สวย เหมือน เจ้าของ แบรนด์ สิ คะ เนย โชติ กา ใบหน้า สวย ใส xxunk แม้ แต่งหน้า นี่ ขนาด เป็น คุณแม่ แล้ว นะเนี่ย ก็ ยัง สวย ไม่ xxunk ผ่าน ไป กี่ ปี ๆ ก็ ไม่ เปลี่ยน ผิว ดี๊ ดี ความ สวย . เรา สร้าง เอง ได้ ด้วยตัวเอง ถ้า ได้ ใช้ มาส ์กโช ต้อง สวย เหมือน โชติ กา แน่นอน ค่ะ # มาส ์กโช สวย ข้ามคืน # cho _ cosmetics # daradaily # ดารา เดลี่,xxbos ข้าว โถ ละ ร้อย แพง เพราะ ตัก เป็น จาน ๆ ละ 15 เต็มที่ ก็ 5 จาน คนไทย ต้อง กินข้าว ประเทศ xxunk ข้าว กินข้าว ในประเทศ ตัวเอง หม้อ เป็น ร้อย เป็นลม ดีกว่า ค่า ฉะ xxunk ถุง 5 โล ไม่ เกิน 200 เป็น ข้าว มะลิ ไก่ นี่ ไม่รู้ ว่า เป็นตัว หรือเปล่า แต่ ถ้า ตัว ละ 250 บาท แพง ไก่ย่าง ขาย 140 - 160 มี เยอะแยะ ยัง ได้ กำไร แพง สุด ไม่ ควร xxunk 200 ข้าวผัด ปู จาน ใหญ่ 300 ร้อย แพง ถ้า ผัด เป็น จาน ๆ ละ 50 ผัด 4 จาน ก็ เต็ม ถาด ใหญ่ แล้ว ส่วน เครื่อง ดืม ด้านบน อะไร 80 ถ้า เป็น ชาเย็น แพง มาก น้ำ ดืม ขวด ใหญ่ ขวด ละ 50 แพง บ้าน เรา เมืองร้อน อย่า เห็นแก่ตัว 30 ก็ พอแล้ว คน ต้อง ซื้อ เยอะ เบียร์ ช้าง ขวด ละ 120 กำไร xxunk น่าเกลียด มา 3 ขวด 360 แพง xxunk ขวด ใหญ่ ก็ แพง แต่ น้ำแข็ง พอได้ เพราะ อากาศ บ้าน เรา ร้อน ละลาย ง่าย อันนี้ พอ เข้าใจ คน ขาย แต่ ทะเล เผา ทะเล ลวก ไม่เห็น หน้าตา ว่า มี อะไร บ้าง ก็ กุ้ง หมึก ปู xxunk 300 ตัว กลางๆ ใส่ มา อย่าง 5 ตัว ปู สัก ตัว กำไร xxunk ตำ ทะเล ตำ กุ้ง สด 150 ไม่ แพง กุ้ง ชุ ป แป้ง ทอด แพง มาก ๆ ต้มยำ หม้อ ละ 300 ร้อย ถือว่า แพง มาก เพราะ มัน ใส่ ได้ ไม่ เยอะ หรอก มันดี ตรง ที่ มี น้ำ กับ ไฟ อุ่น ร้อน ของ กินใน บ้าน ยัง แทบ แตะ ไม่ได้ ทั้งที่ ป ระ ก็ xxunk คน ในประเทศ ยัง กิน ไม่ อิม ส่งออก นอก พอ มี น้อย เหลือ น้อย ก็ ขาย ให้ กัน แพง ๆ ระบบ แย่ เอาเปรียบ กันเอง ที่ บ้าน ขาย อาหาร กับข้าว ตาม สั่ง ป รุ่ง สุก ใหม่ แค่ จาน ละ 100 - 150 คน ยัง ว่า แพง ทั้งที่ บอ กว่า อร่อย ถ้า ไป เจอ แบบนี้ สงสัย ช็อค ตาย คา ร้าน เลย มัง ถ้า ต้อง ไป เจอ ทั้ง แพง ทั้ง ไม่อร่อย สัก แต่ ทำ ขาย ใคร ว่า 
ไม่ แพง ยินดี ด้วย ที่ คุณ เป็น คน มี ตัง แต่ เรา มอง จาก ค่าแรง กลางๆ ของ คน ในประเทศ นะ ซึ่ง ส่วนมาก คนใน ประเทศไทย ได้ แต่ ค่าแรงขั้นต่ำ กับ เบี้ย ขยัน เล็กน้อย คนจน เยอะ กว่า คนรวย ด้วย ทำ อะไร ต้อง นึกถึง ความ สมควร นึกถึง กันและกัน แต่ เรา ไม่ใช่ คน xxunk ชอบ มอง และ คิด จาก ความเป็นจริง ถึง ขาย ให้ ชาวต่างชาติ ก็ เถอะ เหมือน คนไทย เอาเปรียบ ช่วย โอกาส และ ไม่ ค่อย xxunk\n", - "y: CategoryList\n", - "neu,neu,neg,neu,neg\n", - "Path: wisesight_data;\n", - "\n", - "Test: None, model=SequentialRNN(\n", - " (0): MultiBatchEncoder(\n", - " (module): AWD_LSTM(\n", - " (encoder): Embedding(15000, 400, padding_idx=1)\n", - " (encoder_dp): EmbeddingDropout(\n", - " (emb): Embedding(15000, 400, padding_idx=1)\n", - " )\n", - " (rnns): ModuleList(\n", - " (0): WeightDropout(\n", - " (module): LSTM(400, 1550, batch_first=True)\n", - " )\n", - " (1): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (2): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (3): WeightDropout(\n", - " (module): LSTM(1550, 400, batch_first=True)\n", - " )\n", - " )\n", - " (input_dp): RNNDropout()\n", - " (hidden_dps): ModuleList(\n", - " (0): RNNDropout()\n", - " (1): RNNDropout()\n", - " (2): RNNDropout()\n", - " (3): RNNDropout()\n", - " )\n", - " )\n", - " )\n", - " (1): PoolingLinearClassifier(\n", - " (layers): Sequential(\n", - " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (1): Dropout(p=0.27999999999999997)\n", - " (2): Linear(in_features=1200, out_features=50, bias=True)\n", - " (3): ReLU(inplace)\n", - " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (5): Dropout(p=0.1)\n", - " (6): Linear(in_features=50, out_features=4, bias=True)\n", - " )\n", - " )\n", - "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, 
path=PosixPath('wisesight_data'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[...], layer_groups=[Sequential(\n", - " (0): Embedding(15000, 400, padding_idx=1)\n", - " (1): EmbeddingDropout(\n", - " (emb): Embedding(15000, 400, padding_idx=1)\n", - " )\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(400, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 400, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): PoolingLinearClassifier(\n", - " (layers): Sequential(\n", - " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (1): Dropout(p=0.27999999999999997)\n", - " (2): Linear(in_features=1200, out_features=50, bias=True)\n", - " (3): ReLU(inplace)\n", - " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (5): Dropout(p=0.1)\n", - " (6): Linear(in_features=50, out_features=4, bias=True)\n", - " )\n", - " )\n", - ")], add_time=True, silent=False, cb_fns_registered=False)\n", - "alpha: 2\n", - "beta: 1], layer_groups=[Sequential(\n", - " (0): Embedding(15000, 400, padding_idx=1)\n", - " (1): EmbeddingDropout(\n", - " (emb): Embedding(15000, 400, padding_idx=1)\n", - " )\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(400, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): 
LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 400, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): PoolingLinearClassifier(\n", - " (layers): Sequential(\n", - " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (1): Dropout(p=0.27999999999999997)\n", - " (2): Linear(in_features=1200, out_features=50, bias=True)\n", - " (3): ReLU(inplace)\n", - " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (5): Dropout(p=0.1)\n", - " (6): Linear(in_features=50, out_features=4, bias=True)\n", - " )\n", - " )\n", - ")], add_time=True, silent=False, cb_fns_registered=False)" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 40 - } + "text/plain": [ + "" ] + }, + "metadata": { + "tags": [] + }, + "output_type": "display_data" + } + ], + "source": [ + "#train unfrozen\n", + "print(\"training unfrozen\")\n", + "learn.unfreeze()\n", + "learn.fit_one_cycle(5, 1e-3, moms=(0.8, 0.7))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "OZC4BGnB6z6L" + }, + "outputs": [], + "source": [ + "# learn.save('wisesight_lm')\n", + "learn.save_encoder(\"wisesight_enc\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "hTTQ76Ls6z6N" + }, + "source": [ + "### Train Text Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 }, + "colab_type": "code", + "id": "A2Z09Mf26z6N", + "outputId": "27cfec82-23e3-4e0b-dd0d-7b4ba855e646" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "pRgoPD766z6S", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# #train unfrozen\n", - "# learn.freeze_to(-1)\n", - "# 
learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7))\n", - "# learn.freeze_to(-2)\n", - "# learn.fit_one_cycle(1, slice(1e-2 / (2.6 ** 4), 1e-2), moms=(0.8, 0.7))\n", - "# learn.freeze_to(-3)\n", - "# learn.fit_one_cycle(1, slice(5e-3 / (2.6 ** 4), 5e-3), moms=(0.8, 0.7))\n", - "# learn.unfreeze()\n", - "# learn.fit_one_cycle(10, slice(1e-3 / (2.6 ** 4), 1e-3), moms=(0.8, 0.7),\n", - "# callbacks=[SaveModelCallback(learn, every='improvement', monitor='accuracy', name='bestmodel')])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "w54ZOwk66z6U", - "colab_type": "text" - }, - "source": [ - "Training takes about 20 minutes so we use the script `train_model.py` to do it with the following results (validation run):\n", - "\n", - "```\n", - "epoch train_loss valid_loss accuracy\n", - "1 0.812156 0.753478 0.687532\n", - "Total time: 00:56\n", - "epoch train_loss valid_loss accuracy\n", - "1 0.740403 0.699093 0.714394\n", - "Total time: 00:57\n", - "epoch train_loss valid_loss accuracy\n", - "1 0.727394 0.668807 0.723011\n", - "Total time: 01:34\n", - "epoch train_loss valid_loss accuracy\n", - "1 0.722163 0.675351 0.723517\n", - "2 0.675266 0.654477 0.738723\n", - "3 0.669178 0.641070 0.737962\n", - "4 0.612528 0.637456 0.744551\n", - "5 0.618259 0.635149 0.749366\n", - "6 0.572621 0.651169 0.749873\n", - "7 0.561985 0.661739 0.747593\n", - "8 0.534753 0.673563 0.738469\n", - "9 0.530844 0.688871 0.746072\n", - "10 0.522788 0.670024 0.743031\n", - "Total time: 23:42\n", - "```" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "15000\n" + ] + } + ], + "source": [ + "#lm data\n", + "data_lm = load_data(model_path, \"wisesight_lm.pkl\")\n", + "data_lm.sanity_check()\n", + "\n", + "#classification data\n", + "tt = Tokenizer(tok_func=ThaiTokenizer, lang=\"th\", pre_rules=pre_rules_th, post_rules=post_rules_th)\n", + "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", + " 
NumericalizeProcessor(vocab=data_lm.vocab, max_vocab=60000, min_freq=20)]\n", + "\n", + "data_cls = (ItemLists(model_path,train=TextList.from_df(train_df, model_path, cols=[\"texts\"], processor=processor),\n", + " valid=TextList.from_df(valid_df, model_path, cols=[\"texts\"], processor=processor))\n", + " .label_from_df(\"category\")\n", + " .databunch(bs=50)\n", + " )\n", + "data_cls.sanity_check()\n", + "print(len(data_cls.vocab.itos))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, + "colab_type": "code", + "id": "RjRFWx8-6z6P", + "outputId": "76a4bb55-a3cd-4d51-d5fc-6c2fec878573" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "vM--oaCJ6z6V", - "colab_type": "text" - }, - "source": [ - "### See Results" + "data": { + "text/plain": [ + "RNNLearner(data=TextClasDataBunch;\n", + "\n", + "Train: LabelList (20453 items)\n", + "x: TextList\n", + "xxbos กันแดด คิว เพลส ตัวใหม่ นี่ คุม มัน ดีจริง อ่ะ นี่ หน้า มัน ยิ่ง ที โซน ยิ่ง มัน เยอะ นีเวีย หลอด ยาว ๆ ฝา เขียว ก็ เอา ไม่อยู่ อ่ะ แล้ว xxunk,xxbos พบ กับ การ ร่วม ตัว ของ ศิลปิน soul pop สาม ยุค สาม สไตล์ ใน งาน jamnight อะไร ก็ช่าง xxunk ( ชุ่ย ) นำ ทีม โดย soul after six , the parkinson และ the xxup toys งาน นี้ นอกจาก จะ ได้ ดู โชว์ แบบ เต็ม รูปแบบ จาก ทั้ง สาม วง แล้ว ยัง มี โชว์ สุด พิเศษ ที่ ทั้ง สาม จะ ร่วม แจม กัน ด้วย ไม่ อยาก ให้ พลาด เจอกัน วันที่ 29 กันยายน นี้ ที่ ช่าง ชุ่ย ประตู เปิด 19.00 น. เป็นต้นไป สามารถ ซื้อ บัตร ได้ แล้ว ที่ event pop : http : / / go . eventpop . 
me / jamnight * จำกัด ผู้ ที่ มีอายุ 20 ปี ขึ้นไป # jamnightbyjameson # jamesonthailand # soulaftersix # theparkinson # thetoys,xxbos 👌 🏻 👌 🏻 👌 🏻 xxwrep 2 😆,xxbos จ - ศ แถม ถึง 29 ไม่ทัน มะ ว้า xxrep 4 ,xxbos ใช้ ดี ค่ะ บอกต่อ คือ เป็น คน แพ้ ง่าย มาก กก ใช้ กา นิ เย หรือ พอน ก็ แพ้ แต่ ใช้ ครีม แตงโม แล้ว คือ ดี สิว ลด กันน้ำ ค่ะ นี้ ใช้ ไป เล่น สงกรานต์ มา รอด ค่ะ 555 เล่อ ค่า xxrep 5 \n", + "y: CategoryList\n", + "neg,neu,neu,neu,neg\n", + "Path: wisesight_data;\n", + "\n", + "Valid: LabelList (3610 items)\n", + "x: TextList\n", + "xxbos เห็น คน ลบ แอ พ viu ก็ เห็นใจ และ เข้าใจ เขา นะคะ แผล มัน ยัง ใหม่ แถม อารมณ์ ยิ่ง โดน xxunk ง่าย อยู่ นี่ เนอะ 5 xxrep 7 ส่วน ทาง นี้ ก็ กอด netflix แน่น มาก เธอ อย่า ทำร้าย เรา นะ เรา รู้ เธอ ไม่ ทำร้าย เรา แน่นอน,xxbos ไป ชม ไม้ คิว ของ แชมป์ และ รอง แชมป์ กัน จ้า ! . xxrep 32 เก็บตก จาก การแข่งขัน แสงโสม สนุกเกอร์ 6 แดง โอเพ่น ประจำปี 2560 สนาม ที่ 2 ณ มัง กี้ สนุกเกอร์ คลับ ซอย โชค ชัย 4 ลาดพร้าว เมื่อ วันที่ 12 ต.ค. 60,xxbos กลุ่ม รถ ซีวิค เป็น กลุ่ม ที่ น่า รำ คาน มาก xxrep 9 อวด รถ กัน ได้ ทุก วินาที อวด ทำไม มึง ก็ ใช้ รถ เหมือนกัน ทุกคน ละ ก็ พวก xxunk ที่ บอ กว่า อวด รถ แต่ ถ่าย นม ตัวเอง ชัด ละ รถ เบลอ นี่ คือ ? xxrep 5 ,xxbos อยาก สวย เหมือน เจ้าของ แบรนด์ สิ คะ เนย โชติ กา ใบหน้า สวย ใส xxunk แม้ แต่งหน้า นี่ ขนาด เป็น คุณแม่ แล้ว นะเนี่ย ก็ ยัง สวย ไม่ xxunk ผ่าน ไป กี่ ปี ๆ ก็ ไม่ เปลี่ยน ผิว ดี๊ ดี ความ สวย . 
เรา สร้าง เอง ได้ ด้วยตัวเอง ถ้า ได้ ใช้ มาส ์กโช ต้อง สวย เหมือน โชติ กา แน่นอน ค่ะ # มาส ์กโช สวย ข้ามคืน # cho _ cosmetics # daradaily # ดารา เดลี่,xxbos ข้าว โถ ละ ร้อย แพง เพราะ ตัก เป็น จาน ๆ ละ 15 เต็มที่ ก็ 5 จาน คนไทย ต้อง กินข้าว ประเทศ xxunk ข้าว กินข้าว ในประเทศ ตัวเอง หม้อ เป็น ร้อย เป็นลม ดีกว่า ค่า ฉะ xxunk ถุง 5 โล ไม่ เกิน 200 เป็น ข้าว มะลิ ไก่ นี่ ไม่รู้ ว่า เป็นตัว หรือเปล่า แต่ ถ้า ตัว ละ 250 บาท แพง ไก่ย่าง ขาย 140 - 160 มี เยอะแยะ ยัง ได้ กำไร แพง สุด ไม่ ควร xxunk 200 ข้าวผัด ปู จาน ใหญ่ 300 ร้อย แพง ถ้า ผัด เป็น จาน ๆ ละ 50 ผัด 4 จาน ก็ เต็ม ถาด ใหญ่ แล้ว ส่วน เครื่อง ดืม ด้านบน อะไร 80 ถ้า เป็น ชาเย็น แพง มาก น้ำ ดืม ขวด ใหญ่ ขวด ละ 50 แพง บ้าน เรา เมืองร้อน อย่า เห็นแก่ตัว 30 ก็ พอแล้ว คน ต้อง ซื้อ เยอะ เบียร์ ช้าง ขวด ละ 120 กำไร xxunk น่าเกลียด มา 3 ขวด 360 แพง xxunk ขวด ใหญ่ ก็ แพง แต่ น้ำแข็ง พอได้ เพราะ อากาศ บ้าน เรา ร้อน ละลาย ง่าย อันนี้ พอ เข้าใจ คน ขาย แต่ ทะเล เผา ทะเล ลวก ไม่เห็น หน้าตา ว่า มี อะไร บ้าง ก็ กุ้ง หมึก ปู xxunk 300 ตัว กลางๆ ใส่ มา อย่าง 5 ตัว ปู สัก ตัว กำไร xxunk ตำ ทะเล ตำ กุ้ง สด 150 ไม่ แพง กุ้ง ชุ ป แป้ง ทอด แพง มาก ๆ ต้มยำ หม้อ ละ 300 ร้อย ถือว่า แพง มาก เพราะ มัน ใส่ ได้ ไม่ เยอะ หรอก มันดี ตรง ที่ มี น้ำ กับ ไฟ อุ่น ร้อน ของ กินใน บ้าน ยัง แทบ แตะ ไม่ได้ ทั้งที่ ป ระ ก็ xxunk คน ในประเทศ ยัง กิน ไม่ อิม ส่งออก นอก พอ มี น้อย เหลือ น้อย ก็ ขาย ให้ กัน แพง ๆ ระบบ แย่ เอาเปรียบ กันเอง ที่ บ้าน ขาย อาหาร กับข้าว ตาม สั่ง ป รุ่ง สุก ใหม่ แค่ จาน ละ 100 - 150 คน ยัง ว่า แพง ทั้งที่ บอ กว่า อร่อย ถ้า ไป เจอ แบบนี้ สงสัย ช็อค ตาย คา ร้าน เลย มัง ถ้า ต้อง ไป เจอ ทั้ง แพง ทั้ง ไม่อร่อย สัก แต่ ทำ ขาย ใคร ว่า ไม่ แพง ยินดี ด้วย ที่ คุณ เป็น คน มี ตัง แต่ เรา มอง จาก ค่าแรง กลางๆ ของ คน ในประเทศ นะ ซึ่ง ส่วนมาก คนใน ประเทศไทย ได้ แต่ ค่าแรงขั้นต่ำ กับ เบี้ย ขยัน เล็กน้อย คนจน เยอะ กว่า คนรวย ด้วย ทำ อะไร ต้อง นึกถึง ความ สมควร นึกถึง กันและกัน แต่ เรา ไม่ใช่ คน xxunk ชอบ มอง และ คิด จาก ความเป็นจริง ถึง ขาย ให้ ชาวต่างชาติ ก็ เถอะ เหมือน คนไทย เอาเปรียบ ช่วย โอกาส และ ไม่ ค่อย xxunk\n", + "y: CategoryList\n", + 
"neu,neu,neg,neu,neg\n", + "Path: wisesight_data;\n", + "\n", + "Test: None, model=SequentialRNN(\n", + " (0): MultiBatchEncoder(\n", + " (module): AWD_LSTM(\n", + " (encoder): Embedding(15000, 400, padding_idx=1)\n", + " (encoder_dp): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + " (rnns): ModuleList(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (2): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (3): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " )\n", + " (input_dp): RNNDropout()\n", + " (hidden_dps): ModuleList(\n", + " (0): RNNDropout()\n", + " (1): RNNDropout()\n", + " (2): RNNDropout()\n", + " (3): RNNDropout()\n", + " )\n", + " )\n", + " )\n", + " (1): PoolingLinearClassifier(\n", + " (layers): Sequential(\n", + " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (1): Dropout(p=0.27999999999999997)\n", + " (2): Linear(in_features=1200, out_features=50, bias=True)\n", + " (3): ReLU(inplace)\n", + " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): Dropout(p=0.1)\n", + " (6): Linear(in_features=50, out_features=4, bias=True)\n", + " )\n", + " )\n", + "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('wisesight_data'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[RNNTrainer\n", + "learn: RNNLearner(data=TextClasDataBunch;\n", + "\n", + "Train: LabelList (20453 items)\n", + "x: TextList\n", + "xxbos กันแดด คิว เพลส ตัวใหม่ นี่ คุม มัน ดีจริง อ่ะ นี่ หน้า มัน ยิ่ง ที โซน ยิ่ง มัน เยอะ นีเวีย หลอด ยาว ๆ ฝา เขียว ก็ เอา ไม่อยู่ อ่ะ แล้ว 
xxunk,xxbos พบ กับ การ ร่วม ตัว ของ ศิลปิน soul pop สาม ยุค สาม สไตล์ ใน งาน jamnight อะไร ก็ช่าง xxunk ( ชุ่ย ) นำ ทีม โดย soul after six , the parkinson และ the xxup toys งาน นี้ นอกจาก จะ ได้ ดู โชว์ แบบ เต็ม รูปแบบ จาก ทั้ง สาม วง แล้ว ยัง มี โชว์ สุด พิเศษ ที่ ทั้ง สาม จะ ร่วม แจม กัน ด้วย ไม่ อยาก ให้ พลาด เจอกัน วันที่ 29 กันยายน นี้ ที่ ช่าง ชุ่ย ประตู เปิด 19.00 น. เป็นต้นไป สามารถ ซื้อ บัตร ได้ แล้ว ที่ event pop : http : / / go . eventpop . me / jamnight * จำกัด ผู้ ที่ มีอายุ 20 ปี ขึ้นไป # jamnightbyjameson # jamesonthailand # soulaftersix # theparkinson # thetoys,xxbos 👌 🏻 👌 🏻 👌 🏻 xxwrep 2 😆,xxbos จ - ศ แถม ถึง 29 ไม่ทัน มะ ว้า xxrep 4 ,xxbos ใช้ ดี ค่ะ บอกต่อ คือ เป็น คน แพ้ ง่าย มาก กก ใช้ กา นิ เย หรือ พอน ก็ แพ้ แต่ ใช้ ครีม แตงโม แล้ว คือ ดี สิว ลด กันน้ำ ค่ะ นี้ ใช้ ไป เล่น สงกรานต์ มา รอด ค่ะ 555 เล่อ ค่า xxrep 5 \n", + "y: CategoryList\n", + "neg,neu,neu,neu,neg\n", + "Path: wisesight_data;\n", + "\n", + "Valid: LabelList (3610 items)\n", + "x: TextList\n", + "xxbos เห็น คน ลบ แอ พ viu ก็ เห็นใจ และ เข้าใจ เขา นะคะ แผล มัน ยัง ใหม่ แถม อารมณ์ ยิ่ง โดน xxunk ง่าย อยู่ นี่ เนอะ 5 xxrep 7 ส่วน ทาง นี้ ก็ กอด netflix แน่น มาก เธอ อย่า ทำร้าย เรา นะ เรา รู้ เธอ ไม่ ทำร้าย เรา แน่นอน,xxbos ไป ชม ไม้ คิว ของ แชมป์ และ รอง แชมป์ กัน จ้า ! . xxrep 32 เก็บตก จาก การแข่งขัน แสงโสม สนุกเกอร์ 6 แดง โอเพ่น ประจำปี 2560 สนาม ที่ 2 ณ มัง กี้ สนุกเกอร์ คลับ ซอย โชค ชัย 4 ลาดพร้าว เมื่อ วันที่ 12 ต.ค. 60,xxbos กลุ่ม รถ ซีวิค เป็น กลุ่ม ที่ น่า รำ คาน มาก xxrep 9 อวด รถ กัน ได้ ทุก วินาที อวด ทำไม มึง ก็ ใช้ รถ เหมือนกัน ทุกคน ละ ก็ พวก xxunk ที่ บอ กว่า อวด รถ แต่ ถ่าย นม ตัวเอง ชัด ละ รถ เบลอ นี่ คือ ? xxrep 5 ,xxbos อยาก สวย เหมือน เจ้าของ แบรนด์ สิ คะ เนย โชติ กา ใบหน้า สวย ใส xxunk แม้ แต่งหน้า นี่ ขนาด เป็น คุณแม่ แล้ว นะเนี่ย ก็ ยัง สวย ไม่ xxunk ผ่าน ไป กี่ ปี ๆ ก็ ไม่ เปลี่ยน ผิว ดี๊ ดี ความ สวย . 
เรา สร้าง เอง ได้ ด้วยตัวเอง ถ้า ได้ ใช้ มาส ์กโช ต้อง สวย เหมือน โชติ กา แน่นอน ค่ะ # มาส ์กโช สวย ข้ามคืน # cho _ cosmetics # daradaily # ดารา เดลี่,xxbos ข้าว โถ ละ ร้อย แพง เพราะ ตัก เป็น จาน ๆ ละ 15 เต็มที่ ก็ 5 จาน คนไทย ต้อง กินข้าว ประเทศ xxunk ข้าว กินข้าว ในประเทศ ตัวเอง หม้อ เป็น ร้อย เป็นลม ดีกว่า ค่า ฉะ xxunk ถุง 5 โล ไม่ เกิน 200 เป็น ข้าว มะลิ ไก่ นี่ ไม่รู้ ว่า เป็นตัว หรือเปล่า แต่ ถ้า ตัว ละ 250 บาท แพง ไก่ย่าง ขาย 140 - 160 มี เยอะแยะ ยัง ได้ กำไร แพง สุด ไม่ ควร xxunk 200 ข้าวผัด ปู จาน ใหญ่ 300 ร้อย แพง ถ้า ผัด เป็น จาน ๆ ละ 50 ผัด 4 จาน ก็ เต็ม ถาด ใหญ่ แล้ว ส่วน เครื่อง ดืม ด้านบน อะไร 80 ถ้า เป็น ชาเย็น แพง มาก น้ำ ดืม ขวด ใหญ่ ขวด ละ 50 แพง บ้าน เรา เมืองร้อน อย่า เห็นแก่ตัว 30 ก็ พอแล้ว คน ต้อง ซื้อ เยอะ เบียร์ ช้าง ขวด ละ 120 กำไร xxunk น่าเกลียด มา 3 ขวด 360 แพง xxunk ขวด ใหญ่ ก็ แพง แต่ น้ำแข็ง พอได้ เพราะ อากาศ บ้าน เรา ร้อน ละลาย ง่าย อันนี้ พอ เข้าใจ คน ขาย แต่ ทะเล เผา ทะเล ลวก ไม่เห็น หน้าตา ว่า มี อะไร บ้าง ก็ กุ้ง หมึก ปู xxunk 300 ตัว กลางๆ ใส่ มา อย่าง 5 ตัว ปู สัก ตัว กำไร xxunk ตำ ทะเล ตำ กุ้ง สด 150 ไม่ แพง กุ้ง ชุ ป แป้ง ทอด แพง มาก ๆ ต้มยำ หม้อ ละ 300 ร้อย ถือว่า แพง มาก เพราะ มัน ใส่ ได้ ไม่ เยอะ หรอก มันดี ตรง ที่ มี น้ำ กับ ไฟ อุ่น ร้อน ของ กินใน บ้าน ยัง แทบ แตะ ไม่ได้ ทั้งที่ ป ระ ก็ xxunk คน ในประเทศ ยัง กิน ไม่ อิม ส่งออก นอก พอ มี น้อย เหลือ น้อย ก็ ขาย ให้ กัน แพง ๆ ระบบ แย่ เอาเปรียบ กันเอง ที่ บ้าน ขาย อาหาร กับข้าว ตาม สั่ง ป รุ่ง สุก ใหม่ แค่ จาน ละ 100 - 150 คน ยัง ว่า แพง ทั้งที่ บอ กว่า อร่อย ถ้า ไป เจอ แบบนี้ สงสัย ช็อค ตาย คา ร้าน เลย มัง ถ้า ต้อง ไป เจอ ทั้ง แพง ทั้ง ไม่อร่อย สัก แต่ ทำ ขาย ใคร ว่า ไม่ แพง ยินดี ด้วย ที่ คุณ เป็น คน มี ตัง แต่ เรา มอง จาก ค่าแรง กลางๆ ของ คน ในประเทศ นะ ซึ่ง ส่วนมาก คนใน ประเทศไทย ได้ แต่ ค่าแรงขั้นต่ำ กับ เบี้ย ขยัน เล็กน้อย คนจน เยอะ กว่า คนรวย ด้วย ทำ อะไร ต้อง นึกถึง ความ สมควร นึกถึง กันและกัน แต่ เรา ไม่ใช่ คน xxunk ชอบ มอง และ คิด จาก ความเป็นจริง ถึง ขาย ให้ ชาวต่างชาติ ก็ เถอะ เหมือน คนไทย เอาเปรียบ ช่วย โอกาส และ ไม่ ค่อย xxunk\n", + "y: CategoryList\n", + 
"neu,neu,neg,neu,neg\n", + "Path: wisesight_data;\n", + "\n", + "Test: None, model=SequentialRNN(\n", + " (0): MultiBatchEncoder(\n", + " (module): AWD_LSTM(\n", + " (encoder): Embedding(15000, 400, padding_idx=1)\n", + " (encoder_dp): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + " (rnns): ModuleList(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (2): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (3): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " )\n", + " (input_dp): RNNDropout()\n", + " (hidden_dps): ModuleList(\n", + " (0): RNNDropout()\n", + " (1): RNNDropout()\n", + " (2): RNNDropout()\n", + " (3): RNNDropout()\n", + " )\n", + " )\n", + " )\n", + " (1): PoolingLinearClassifier(\n", + " (layers): Sequential(\n", + " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (1): Dropout(p=0.27999999999999997)\n", + " (2): Linear(in_features=1200, out_features=50, bias=True)\n", + " (3): ReLU(inplace)\n", + " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): Dropout(p=0.1)\n", + " (6): Linear(in_features=50, out_features=4, bias=True)\n", + " )\n", + " )\n", + "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('wisesight_data'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[...], layer_groups=[Sequential(\n", + " (0): Embedding(15000, 400, padding_idx=1)\n", + " (1): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", 
+ " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): PoolingLinearClassifier(\n", + " (layers): Sequential(\n", + " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (1): Dropout(p=0.27999999999999997)\n", + " (2): Linear(in_features=1200, out_features=50, bias=True)\n", + " (3): ReLU(inplace)\n", + " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): Dropout(p=0.1)\n", + " (6): Linear(in_features=50, out_features=4, bias=True)\n", + " )\n", + " )\n", + ")], add_time=True, silent=False, cb_fns_registered=False)\n", + "alpha: 2\n", + "beta: 1], layer_groups=[Sequential(\n", + " (0): Embedding(15000, 400, padding_idx=1)\n", + " (1): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): PoolingLinearClassifier(\n", + " (layers): Sequential(\n", + " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (1): 
Dropout(p=0.27999999999999997)\n", + " (2): Linear(in_features=1200, out_features=50, bias=True)\n", + " (3): ReLU(inplace)\n", + " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): Dropout(p=0.1)\n", + " (6): Linear(in_features=50, out_features=4, bias=True)\n", + " )\n", + " )\n", + ")], add_time=True, silent=False, cb_fns_registered=False)" ] - }, - { - "cell_type": "code", - "metadata": { - "id": "eOCe24KL6z6W", - "colab_type": "code", - "colab": {} - }, - "source": [ - "learn.load(\"bestmodel\")\n", - "\n", - "#get predictions\n", - "probs, y_true, loss = learn.get_preds(ds_type = DatasetType.Valid, ordered=True, with_loss=True)\n", - "classes = learn.data.train_ds.classes\n", - "y_true = np.array([classes[i] for i in y_true.numpy()])\n", - "preds = np.array([classes[i] for i in probs.argmax(1).numpy()])\n", - "prob = probs.numpy()\n", - "loss = loss.numpy()" - ], - "execution_count": 0, - "outputs": [] - }, + }, + "execution_count": 40, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "#model\n", + "config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False,\n", + " output_p=0.4, hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5)\n", + "trn_args = dict(bptt=70, drop_mult=0.7, alpha=2, beta=1, max_len=500)\n", + "\n", + "learn = text_classifier_learner(data_cls, AWD_LSTM, config=config, pretrained=False, **trn_args)\n", + "#load pretrained finetuned model\n", + "learn.load_encoder(\"wisesight_enc\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "pRgoPD766z6S" + }, + "outputs": [], + "source": [ + "# #train unfrozen\n", + "# learn.freeze_to(-1)\n", + "# learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7))\n", + "# learn.freeze_to(-2)\n", + "# learn.fit_one_cycle(1, slice(1e-2 / (2.6 ** 4), 1e-2), moms=(0.8, 0.7))\n", + "# learn.freeze_to(-3)\n", + "# learn.fit_one_cycle(1, slice(5e-3 
/ (2.6 ** 4), 5e-3), moms=(0.8, 0.7))\n", + "# learn.unfreeze()\n", + "# learn.fit_one_cycle(10, slice(1e-3 / (2.6 ** 4), 1e-3), moms=(0.8, 0.7),\n", + "# callbacks=[SaveModelCallback(learn, every='improvement', monitor='accuracy', name='bestmodel')])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "w54ZOwk66z6U" + }, + "source": [ + "Training takes about 20 minutes so we use the script `train_model.py` to do it with the following results (validation run):\n", + "\n", + "```\n", + "epoch train_loss valid_loss accuracy\n", + "1 0.812156 0.753478 0.687532\n", + "Total time: 00:56\n", + "epoch train_loss valid_loss accuracy\n", + "1 0.740403 0.699093 0.714394\n", + "Total time: 00:57\n", + "epoch train_loss valid_loss accuracy\n", + "1 0.727394 0.668807 0.723011\n", + "Total time: 01:34\n", + "epoch train_loss valid_loss accuracy\n", + "1 0.722163 0.675351 0.723517\n", + "2 0.675266 0.654477 0.738723\n", + "3 0.669178 0.641070 0.737962\n", + "4 0.612528 0.637456 0.744551\n", + "5 0.618259 0.635149 0.749366\n", + "6 0.572621 0.651169 0.749873\n", + "7 0.561985 0.661739 0.747593\n", + "8 0.534753 0.673563 0.738469\n", + "9 0.530844 0.688871 0.746072\n", + "10 0.522788 0.670024 0.743031\n", + "Total time: 23:42\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "vM--oaCJ6z6V" + }, + "source": [ + "### See Results" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "eOCe24KL6z6W" + }, + "outputs": [], + "source": [ + "learn.load(\"bestmodel\")\n", + "\n", + "#get predictions\n", + "probs, y_true, loss = learn.get_preds(ds_type = DatasetType.Valid, ordered=True, with_loss=True)\n", + "classes = learn.data.train_ds.classes\n", + "y_true = np.array([classes[i] for i in y_true.numpy()])\n", + "preds = np.array([classes[i] for i in probs.argmax(1).numpy()])\n", + "prob = probs.numpy()\n", + "loss = loss.numpy()" + ] + 
}, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "LaJYU8f56z6Z", + "outputId": "28603bc9-8cf5-4aba-cfee-836d4c6b5b91" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "LaJYU8f56z6Z", - "colab_type": "code", - "colab": {}, - "outputId": "28603bc9-8cf5-4aba-cfee-836d4c6b5b91" - }, - "source": [ - "to_df = np.concatenate([y_true[:,None],preds[:,None],loss[:,None],prob],1)\n", - "probs_df = pd.DataFrame(to_df)\n", - "probs_df.columns = [\"category\",\"preds\",\"loss\"] + classes\n", - "probs_df[\"hit\"] = (probs_df.category == probs_df.preds)\n", - "probs_df[\"texts\"] = valid_df.texts\n", - "(y_true==preds).mean()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "0.8392661555312158" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 32 - } + "data": { + "text/plain": [ + "0.8392661555312158" ] - }, + }, + "execution_count": 32, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "to_df = np.concatenate([y_true[:,None],preds[:,None],loss[:,None],prob],1)\n", + "probs_df = pd.DataFrame(to_df)\n", + "probs_df.columns = [\"category\",\"preds\",\"loss\"] + classes\n", + "probs_df[\"hit\"] = (probs_df.category == probs_df.preds)\n", + "probs_df[\"texts\"] = valid_df.texts\n", + "(y_true==preds).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "l_evHEMM6z6b", + "outputId": "732e91f4-a281-4a70-bf3b-8c6d43cad41a" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "l_evHEMM6z6b", - "colab_type": "code", - "colab": {}, - "outputId": "732e91f4-a281-4a70-bf3b-8c6d43cad41a" - }, - "source": [ - "from sklearn.metrics import confusion_matrix\n", - "import seaborn as sns\n", - "\n", - "conf_mat = confusion_matrix(probs_df.category,probs_df.preds)\n", - "sns.heatmap(conf_mat, 
annot=True, fmt=\"d\",\n", - " xticklabels=classes, yticklabels=classes)\n", - "plt.ylabel(\"Actual\")\n", - "plt.xlabel(\"Predicted\")\n", - "plt.show()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [], - "needs_background": "light" - } - } + "data": { + "image/png": "\n", + "text/plain": [ + "
" ] + }, + "metadata": { + "needs_background": "light", + "tags": [] + }, + "output_type": "display_data" } - ] -} \ No newline at end of file + ], + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "import seaborn as sns\n", + "\n", + "conf_mat = confusion_matrix(probs_df.category,probs_df.preds)\n", + "sns.heatmap(conf_mat, annot=True, fmt=\"d\",\n", + " xticklabels=classes, yticklabels=classes)\n", + "plt.ylabel(\"Actual\")\n", + "plt.xlabel(\"Predicted\")\n", + "plt.show()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "name": "sentiment_analysis.ipynb", + "provenance": [], + "version": "0.3.2" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/notebooks/text_generation.ipynb b/notebooks/text_generation.ipynb index f39407fcc..63918e2a6 100644 --- a/notebooks/text_generation.ipynb +++ b/notebooks/text_generation.ipynb @@ -1,638 +1,553 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "vfD07MBXKROC" + }, + "source": [ + "# Thai Wiki Language Model for Text Generation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BunBriX0KROF" + }, + "source": [ + "This notebook details how you can use pretrained 
language model on [Thai Wikipedia Dump](https://dumps.wikimedia.org/thwiki/latest/thwiki-latest-pages-articles.xml.bz2) to generate texts." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { "colab": { - "name": "text_generation.ipynb", - "version": "0.3.2", - "provenance": [] + "base_uri": "https://localhost:8080/", + "height": 1000 }, - "accelerator": "GPU" + "colab_type": "code", + "id": "O8IVDoE9KROG", + "outputId": "935e7e3e-6f0d-4880-86b8-30df8e2eb853" + }, + "outputs": [], + "source": [ + "# #uncomment if you are running from google colab\n", + "# !pip install sklearn_crfsuite\n", + "# !pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", + "# !pip install fastai\n", + "# !pip install emoji" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "vfD07MBXKROC", - "colab_type": "text" - }, - "source": [ - "# Thai Wiki Language Model for Text Generation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BunBriX0KROF", - "colab_type": "text" - }, - "source": [ - "This notebook details how you can use pretrained language model on [Thai Wikipedia Dump](https://dumps.wikimedia.org/thwiki/latest/thwiki-latest-pages-articles.xml.bz2) to generate texts." 
- ] + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 119 }, + "colab_type": "code", + "id": "DvwUYZGmKROK", + "outputId": "03569098-5d70-4756-f8b4-c77de3cd4b5c" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "O8IVDoE9KROG", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "outputId": "935e7e3e-6f0d-4880-86b8-30df8e2eb853" - }, - "source": [ - "#uncomment if you are running from google colab\n", - "!pip install sklearn_crfsuite\n", - "!pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", - "!pip install fastai\n", - "!pip install emoji" - ], - "execution_count": 3, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Requirement already satisfied: sklearn_crfsuite in /usr/local/lib/python3.6/dist-packages (0.3.6)\n", - "Requirement already satisfied: python-crfsuite>=0.8.3 in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (0.9.6)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (1.12.0)\n", - "Requirement already satisfied: tqdm>=2.0 in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (4.28.1)\n", - "Requirement already satisfied: tabulate in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (0.8.3)\n", - "Collecting https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", - "\u001b[?25l Downloading https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", - "\u001b[K - 11.2MB 218kB/s\n", - "\u001b[?25hRequirement already satisfied (use --upgrade to upgrade): pythainlp==2.1.dev2 from https://github.com/PyThaiNLP/pythainlp/archive/dev.zip in /usr/local/lib/python3.6/dist-packages\n", - "Requirement already satisfied: dill in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (0.3.0)\n", - "Requirement already satisfied: marisa-trie==0.7.4 in 
/usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (0.7.4)\n", - "Requirement already satisfied: nltk>=3.2.2 in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (3.2.5)\n", - "Requirement already satisfied: pytz in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (2018.9)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (2.21.0)\n", - "Requirement already satisfied: tinydb in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (3.13.0)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (4.28.1)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk>=3.2.2->pythainlp==2.1.dev2) (1.12.0)\n", - "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (2.8)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (2019.6.16)\n", - "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (1.24.3)\n", - "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (3.0.4)\n", - "Building wheels for collected packages: pythainlp\n", - " Building wheel for pythainlp (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for pythainlp: filename=pythainlp-2.1.dev2-cp36-none-any.whl size=11014043 sha256=3dfa6501ae5079e51204d5ab850ab32965c85f27bb642a67712b39b106feb3fc\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-5gfc5rda/wheels/79/4e/1e/26f3198c6712ecfbee92928ed1dde923a078da3d222401cc78\n", - "Successfully built pythainlp\n", - "Requirement already satisfied: fastai in /usr/local/lib/python3.6/dist-packages (1.0.57)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from fastai) (3.13)\n", - "Requirement already satisfied: spacy>=2.0.18 in /usr/local/lib/python3.6/dist-packages (from fastai) (2.1.8)\n", - "Requirement already satisfied: typing; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from fastai) (3.7.4)\n", - "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from fastai) (1.3.1)\n", - "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.6/dist-packages (from fastai) (4.6.3)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from fastai) (19.1)\n", - "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from fastai) (3.0.3)\n", - "Requirement already satisfied: Pillow in /usr/local/lib/python3.6/dist-packages (from fastai) (4.3.0)\n", - "Requirement already satisfied: torch>=1.0.0 in /usr/local/lib/python3.6/dist-packages (from fastai) (1.1.0)\n", - "Requirement already satisfied: fastprogress>=0.1.19 in /usr/local/lib/python3.6/dist-packages (from fastai) (0.1.21)\n", - "Requirement already satisfied: torchvision in /usr/local/lib/python3.6/dist-packages (from fastai) (0.3.0)\n", - "Requirement already satisfied: nvidia-ml-py3 in /usr/local/lib/python3.6/dist-packages (from fastai) (7.352.0)\n", - "Requirement already satisfied: bottleneck in /usr/local/lib/python3.6/dist-packages (from fastai) (1.2.1)\n", - "Requirement already satisfied: 
requests in /usr/local/lib/python3.6/dist-packages (from fastai) (2.21.0)\n", - "Requirement already satisfied: numexpr in /usr/local/lib/python3.6/dist-packages (from fastai) (2.6.9)\n", - "Requirement already satisfied: numpy>=1.15 in /usr/local/lib/python3.6/dist-packages (from fastai) (1.16.4)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from fastai) (0.24.2)\n", - "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from fastai) (0.6)\n", - "Requirement already satisfied: wasabi<1.1.0,>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (0.2.2)\n", - "Requirement already satisfied: plac<1.0.0,>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (0.9.6)\n", - "Requirement already satisfied: blis<0.3.0,>=0.2.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (0.2.4)\n", - "Requirement already satisfied: srsly<1.1.0,>=0.0.6 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (0.0.7)\n", - "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (1.0.2)\n", - "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (2.0.2)\n", - "Requirement already satisfied: thinc<7.1.0,>=7.0.8 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (7.0.8)\n", - "Requirement already satisfied: preshed<2.1.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (2.0.1)\n", - "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->fastai) (2.4.2)\n", - "Requirement already satisfied: attrs in /usr/local/lib/python3.6/dist-packages (from packaging->fastai) (19.1.0)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from 
packaging->fastai) (1.12.0)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->fastai) (0.10.0)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->fastai) (1.1.0)\n", - "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->fastai) (2.5.3)\n", - "Requirement already satisfied: olefile in /usr/local/lib/python3.6/dist-packages (from Pillow->fastai) (0.46)\n", - "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->fastai) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->fastai) (2019.6.16)\n", - "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->fastai) (2.8)\n", - "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->fastai) (3.0.4)\n", - "Requirement already satisfied: pytz>=2011k in /usr/local/lib/python3.6/dist-packages (from pandas->fastai) (2018.9)\n", - "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in /usr/local/lib/python3.6/dist-packages (from thinc<7.1.0,>=7.0.8->spacy>=2.0.18->fastai) (4.28.1)\n", - "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from kiwisolver>=1.0.1->matplotlib->fastai) (41.0.1)\n", - "Collecting emoji\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/1b/d7/2746b4dd67375ce253e777ba54869545d24d2b0249ebcf83735c99df68d5/emoji-0.5.3.tar.gz (43kB)\n", - "\u001b[K |████████████████████████████████| 51kB 4.4MB/s \n", - "\u001b[?25hBuilding wheels for collected packages: emoji\n", - " Building wheel for emoji (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for emoji: filename=emoji-0.5.3-cp36-none-any.whl size=42175 sha256=c3f1611ca03c91684bc818c0ad78dcb8d0542c7eab7fc3dfe3a6640090c8f196\n", - " Stored in directory: /root/.cache/pip/wheels/86/09/26/f944015841423cd516e8a97f30e29be59e53461aea8b7d3458\n", - "Successfully built emoji\n", - "Installing collected packages: emoji\n", - "Successfully installed emoji-0.5.3\n" - ], - "name": "stdout" - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Download: wiki_lm_lstm\n", + "wiki_lm_lstm 0.32\n" + ] }, { - "cell_type": "code", - "metadata": { - "id": "DvwUYZGmKROK", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - }, - "outputId": "03569098-5d70-4756-f8b4-c77de3cd4b5c" - }, - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from ast import literal_eval\n", - "from tqdm import tqdm_notebook\n", - "from collections import Counter\n", - "import re\n", - "\n", - "#viz\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "#fastai\n", - "import fastai\n", - "from fastai.text import *\n", - "from fastai.callbacks import CSVLogger\n", - "\n", - "#pythainlp\n", - "from pythainlp.ulmfit import *" - ], - "execution_count": 4, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Download: wiki_lm_lstm\n", - "wiki_lm_lstm 0.32\n" - ], - "name": "stdout" - }, - { - "output_type": "stream", - "text": [ - "100%|██████████| 1050919089/1050919089 [00:25<00:00, 41157162.35it/s]\n" - ], - "name": "stderr" - }, - { - "output_type": "stream", - "text": [ - "Download: wiki_itos_lstm\n", - "wiki_itos_lstm 0.32\n" - ], - "name": "stdout" - }, - { - "output_type": "stream", - "text": [ - "100%|██████████| 1530484/1530484 [00:00<00:00, 19090275.60it/s]\n" - ], - "name": "stderr" - } - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1050919089/1050919089 [00:25<00:00, 41157162.35it/s]\n" + ] }, { - 
"cell_type": "code", - "metadata": { - "id": "PnQcr3gWKROS", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#get dummy data\n", - "imdb = untar_data(URLs.IMDB_SAMPLE)\n", - "dummy_df = pd.read_csv(imdb/'texts.csv')\n", - "\n", - "#get vocab\n", - "thwiki_itos = pickle.load(open(_THWIKI_LSTM['itos_fname'],'rb'))\n", - "thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)\n", - "\n", - "#dummy databunch\n", - "tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)\n", - "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", - " NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)]\n", - "data_lm = (TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor)\n", - " .random_split_by_pct(0.2)\n", - " .label_for_lm()\n", - " .databunch(bs=64))\n", - "\n", - "\n", - "data_lm.sanity_check()" - ], - "execution_count": 0, - "outputs": [] + "name": "stdout", + "output_type": "stream", + "text": [ + "Download: wiki_itos_lstm\n", + "wiki_itos_lstm 0.32\n" + ] }, { - "cell_type": "code", - "metadata": { - "id": "VJI1MZzvKROW", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "outputId": "f8db8372-fc6a-44ff-f7cd-9e4e8d99684b" - }, - "source": [ - "#check vocab size\n", - "len(data_lm.vocab.itos)" - ], - "execution_count": 6, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "60005" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 6 - } - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1530484/1530484 [00:00<00:00, 19090275.60it/s]\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from ast import literal_eval\n", + "from tqdm import tqdm_notebook\n", + "from collections import Counter\n", + "import re\n", + "\n", + "#viz\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as 
sns\n", + "\n", + "#fastai\n", + "import fastai\n", + "from fastai.text import *\n", + "from fastai.callbacks import CSVLogger\n", + "\n", + "#pythainlp\n", + "from pythainlp.ulmfit import *" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "PnQcr3gWKROS" + }, + "outputs": [], + "source": [ + "#get dummy data\n", + "imdb = untar_data(URLs.IMDB_SAMPLE)\n", + "dummy_df = pd.read_csv(imdb/'texts.csv')\n", + "\n", + "#get vocab\n", + "thwiki_itos = pickle.load(open(_THWIKI_LSTM['itos_fname'],'rb'))\n", + "thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)\n", + "\n", + "#dummy databunch\n", + "tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)\n", + "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", + " NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)]\n", + "data_lm = (TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor)\n", + " .random_split_by_pct(0.2)\n", + " .label_for_lm()\n", + " .databunch(bs=64))\n", + "\n", + "\n", + "data_lm.sanity_check()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 }, + "colab_type": "code", + "id": "VJI1MZzvKROW", + "outputId": "f8db8372-fc6a-44ff-f7cd-9e4e8d99684b" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "B9DJVRZ-KROb", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "outputId": "54031979-d708-4550-d0b7-8cd42b07cf96" - }, - "source": [ - "config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False, tie_weights=True, out_bias=True,\n", - " output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)\n", - "trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1)\n", - "\n", - "learn = language_model_learner(data_lm, AWD_LSTM, 
config=config, pretrained=False, **trn_args)\n", - "\n", - "#load pretrained models\n", - "learn.load_pretrained(**_THWIKI_LSTM)" - ], - "execution_count": 7, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "LanguageLearner(data=TextLMDataBunch;\n", - "\n", - "Train: LabelList (800 items)\n", - "x: LMTextList\n", - "xxbos every once in a long while a movie will come along that will be so xxunk that i feel xxunk to xxunk people . if i labor all my days and i can save but one soul from xxunk this movie , how great will be my joy . \n", - " \n", - " where to begin my xxunk of pain . for xxunk , there was a musical xxunk every five minutes . there was no character development . every character was a stereotype . we had xxunk guy , fat guy who xxunk xxunk , xxunk foreign guy , etc . the script xxunk as if it were being written as the movie was being shot . the production value was so xxunk low that it xxunk like i was xxunk a junior high video presentation . have the directors , producers , etc . ever even seen a movie before ? xxunk is getting worse and worse with every new entry . the concept for this movie xxunk so funny . how could you go wrong with gary coleman and a xxunk of xxunk xxunk actors . but trust me when i say this , things went wrong , xxup very xxup wrong .,xxbos name just says it all . i xxunk this movie with my dad when it came out and having xxunk in korea he had great xxunk for the man . the xxunk thing about this film is that it only xxunk on a short period of the man ' s life - xxunk enough the man ' s entire life would have made such an epic xxunk that it is xxunk to imagine the cost for production . \n", - " \n", - " some xxunk xxunk to the xxunk xxunk about the man , which are cheap xxunk . the theme of the movie \" duty , honor , country \" are not just xxunk words xxunk from the lips of a xxunk officer - it is the deep declaration of one man ' s total xxunk to his country . 
\n", - " \n", - " xxunk peck being the liberal that he was xxunk a better understanding of the man . he does a great job xxunk the fearless general xxunk with the humane side of the man .,xxbos this movie xxunk at being one of the most unique movies you ' ve seen . however this comes from the fact that you can ' t make heads or tails of this mess . it almost xxunk as a series of challenges set up to xxunk xxunk or not you are xxunk to walk out of the movie and give up the money you just paid . if you don ' t want to feel xxunk you ' ll sit through this xxunk film and xxunk a real sense of xxunk for the actors xxunk , they ' ve all seen better days , but then you xxunk they xxunk got paid xxunk a bit of money to do this and you ' ll lose xxunk for them just like you ' ve xxunk done for the film . i can ' t go on enough about this xxunk movie , its almost something that ed wood would have made and in that case it xxunk would have been his xxunk . \n", - " \n", - " to start you are forced to sit through an opening dialogue the xxunk of which you ' ve never seen / heard , this thing has got to be five minutes long . on top of that it is xxunk , as to xxunk that you the viewer cannot read . then we meet mr . xxunk and the xxunk of xxunk lines gets xxunk , it is as if he is operating xxunk to get lines on to the movie xxunk tag line . soon we meet stephen xxunk , who i xxunk xxunk ) and he does his best not to xxunk in this but xxunk he does . then comes the ultimate xxunk , tara reid playing an intelligent role , oh help us ! tara reid is not a very talented actress and xxunk she xxunk gets xxunk in movies , in my xxunk though she should stick to movies of the american pie type . \n", - " \n", - " all in all you just may want to see this for yourself when it comes out on video , i know that i got a kick out of it , i mean xxunk all be xxunk here , xxunk its xxunk to xxunk in the xxunk of others .,xxbos from the start , you know how this movie will end . 
it ' s so full of clich é s your typical xxup xxunk member will not even like this movie . i give it 2 out of 10, only because of the acting of william benton . i can ' t believe people xxunk 6 + for this movie . it ' s so biased towards a ' certain point of view ' ( once a thief xxunk people xxunk ' t born bad . neither are they born good . they are born with a clean slate . it ' s society , parents and education what makes them who they are . and if they take the wrong turn , somewhere down the line , it xxunk isn ' t going to be the american xxunk system that gets them back on track ! xxunk , xxunk this movie like the xxunk . i bet you have better things to do with your time than waste it on this piece of xxunk . \n", - " \n", - " ,xxbos i was xxunk enough to meet george pal ( and still have my xxup ds : xxup xxunk xxunk xxunk by him ) at a convention xxunk after the release , and xxunk him why he xxunk to do the film \" camp \". before he could answer , two studio xxunk xxunk and xxunk me on how the studio \" knew best \" and how \" no one will take such a film xxunk \". i had been reading the xxunk xxunk for a couple of years thanks to a friend ( xxunk xxunk of the 1970 s will recall xxunk and his band ? i was in a couple of years of that with him ), and had higher hopes than what we got . \n", - " \n", - " the xxunk xxunk that no high adventure would ever be done xxunk , and so doing ' camp ' was the only way . several other xxunk xxunk in on my side , with pal listening as best he could . at the end of the little event , pal came up to us and xxunk , xxunk he could have done more and better . \n", - " \n", - " xxup star xxup wars put the lie to the xxunk , and a year after pal ' s death , spielberg and lucas xxunk that doc savage could have xxunk been the next major movie franchise xxunk if it xxunk ' t been for the xxunk . 
\n", - " \n", - " tear out the memory or history of doc , and the film would have been worth a 6 / 10 rating as nothing more than a xxunk xxunk seller . \n", - " \n", - " but xxunk the legacy like that was no less an xxunk than killing a baby in the xxunk . \n", - " \n", - " doc savage can still come to the screen , and survive the xxunk xxunk by the xxunk to indiana jones , but it would have to be done in all xxunk and xxunk to xxunk the glory that we should expect from the first american xxunk . \n", - " \n", - " xxup xxunk : yes , there was a second script for xxup xxunk xxup of xxup evil , and it ' s a lot more serious . yes , there was xxunk xxunk shot , but mostly xxunk xxunk and very little with actors . and , yes , there _ is _ a xxunk of ron xxunk xxunk over a brick wall and xxunk at something over his shoulder with a xxunk built bronze xxunk . xxunk ' s xxunk a xxunk over a button down white shirt with a bronze tie , and the words \" xxup doc xxup savage : xxup xxunk xxup of xxup evil xxunk coming next summer !\" xxup xxunk : if anyone knows who the studio xxunk were that xxunk george pal in 1975 to san diego for the convention , xxunk the xxunk up the side of the head and call them the xxunk that they are . at the time , they were doing xxunk and fu xxunk in stripes and xxunk canvas xxunk , and carrying paramount xxunk .\n", - "y: LMLabelList\n", - ",,,,\n", - "Path: /root/.fastai/data/imdb_sample;\n", - "\n", - "Valid: LabelList (200 items)\n", - "x: LMTextList\n", - "xxbos does any one know what the 2 sports cars were ? i think robert stack ' s might have been a xxunk . rock hudson ' s character told his father he was taking a job in iraq , isn ' t that xxunk ? i have had xxunk malone in my xxunk bank most of my life , maybe this was the film that xxunk me . xxunk xxunk sure did have some xxunk in this film and xxunk xxunk malone but xxunk ' s part made a more xxunk impact so she got the oscar for best supporting role . 
was xxunk ' s part xxunk a leading role ? old man xxunk character was was xxunk a pretty common picture of xxunk of his era in that he was a regular guy who made it big in an emerging industry but in building a whole town he had xxunk his children to have his wife bring them up . in time , being xxunk he xxunk that they were all he really had and they were xxunk rotten , looking for attention , so rather than try to xxunk to his children he xxunk his head off . an ancient morality tale . but xxunk , what were those sports cars ?,xxbos god bless 80 ' s xxunk films . this is a fun , fun movie . this is what xxunk films are all about . now i ' m not xxunk horror movies , just xxunk films . it goes like this : a high school nerd is xxunk on by all these stupid xxunk and xxunk , and then one of their xxunk goes xxunk wrong . xxunk and back for revenge , xxunk a joker / xxunk mask ( pretty xxunk looking , might i add ), marty begins to kill off those xxunk one by one many years later , after he xxunk to make them believe that their old xxunk high school is having a reunion . that is xxunk the plot ? what ' s wrong with that ? that ' s the beauty of 80 ' s xxunk films , most of them i would say . a lot of things could be so xxunk , but they keep drawing you more in an ' in as they go by . xxunk this film . \n", - " \n", - " it features some xxunk xxunk , and some are xxunk creative as well . ( poisoning of a xxunk can , acid bath , i can ' t remember a xxunk ever being used before in any other xxunk film either ) it really is a fun , fun movie . that ' s all it is . nevermind the fact that the characters are complete xxunk , never mind their xxunk , and never mind the xxunk , random things that xxunk in this film . such as lights being able to be controlled by the killer ( when he ' s not even switching any xxunk , you ' ll see ) and xxunk being able to xxunk up blood , xxunk being able to have acid come out of them , just use that as part of your entertainment ! 
because xxunk what really makes it xxunk . \n", - " \n", - " movies like this represent 80 ' s xxunk . never again could movies like this get made , know why ? it isn ' t the 80 ' s xxunk . that is why you should just xxunk them for what they are , good fun ! i highly xxunk this film if you ' re a hardcore fan of xxunk such as friday the 13 th . \n", - " \n", - " one last note this movie also had a kick ass xxunk as well , marty xxunk . a xxunk , nerd , who kills all his old xxunk in a xxunk xxunk mask . a good xxunk makes a good xxunk . simon xxunk , who played marty xxunk xxunk suicide xxunk after xxunk high was released . that alone xxunk something xxunk to the film , and sticks with it and it even makes you feel more sorry for the marty character , i guess . all in all , great 80 ' s xxunk fun ! it ' s a shame it will never be the same again xxunk,xxbos the basic formula for the original series was ; take someone , get the audience to like them , then put them into xxunk danger . this formula xxunk for the 32 xxunk made between 1964 - 68. \n", - " \n", - " now , we jump forward 40 years to xxunk we are xxunk to alan tracy , a xxunk xxunk college school kid , with his friend , fermat , a young xxunk . they are xxunk off by lady xxunk in her pink ford xxunk to the island paradise where the tracy family live , for the school xxunk . almost xxunk , they are left in the care of xxunk and his daughter , xxunk xxunk the xxunk go to rescue john from xxunk 5 which has been xxunk by a xxunk xxunk . this is all part of the hood ' s scheme to take over tracy island so that he can steal the xxunk machines xxunk \n", - " \n", - " xxunk to rob a bank ! \n", - " \n", - " yes . the plot xxup is as xxunk as that ! \n", - " \n", - " the dialogue is xxunk , the acting more wooden than that of the ( xxunk ) puppets , the effects , anything but special and hans xxunk ' s score xxunk what little there was of barry gray ' s glorious theme xxunk through xxunk ' s xxunk xxunk . 
the rest of the score was xxunk xxunk . in fact , part of the score was broadcast the following week on the radio and didn ' t xxunk it ! i didn ' t even xxunk to stay to witness xxunk ' s xxunk xxunk with the end titles \n", - " \n", - " to be fair , ron cook xxunk xxunk well as parker , he and sophia myles as xxunk xxunk xxunk . with the right material , they could have been show xxunk . the xxup cgi work was what i would have called leading edge - 5 years ago . \n", - " \n", - " the dynamics of the main craft were just wrong ; the original series models at least xxunk as if they had mass \n", - " \n", - " another xxunk point is that the whole production xxunk to be one long set of product xxunk , from every vehicle being built by ford to the entire content of the tracy xxunk being produced by ben & jerry ' s . \n", - " \n", - " my son ( 9 ) xxunk the film but this cross between spy kids and ' xxunk ', xxunk xxunk at his age group , added nothing to the xxunk legend . when star trek hit the big screen in 1979 with ' the motion picture ', a whole new xxunk of life was xxunk into the franchise which then continued for another 20 years or so . with this film , xxunk has xxunk a golden opportunity to do the same with the xxunk franchise . \n", - " \n", - " i xxunk that this film , like ' the avengers ' and ' the saint ' before it , will sink into xxunk within 6 months , leaving the original series to its ' classic ' status .,xxbos the views of earth that are xxunk in this film to have been xxunk by xxup nasa have xxunk been xxunk with the historical weather data for the time of apollo 11, and show a good match between the cloud patterns in the video sequence and the xxunk xxunk records on the day . \n", - " \n", - " this would xxunk to xxunk the entire argument put forward in the film that the \" whole earth \" picture is xxunk a small part of the planet framed by the spacecraft window . 
\n", - " \n", - " i am waiting for bart xxunk to now xxunk that the historical weather data has been xxunk by xxup nasa , though that would no doubt xxunk them in also xxunk every xxunk newspaper copy with a weather map , and the ones in private hands would still be a problem . \n", - " \n", - " ah , a response : \" xxunk to xxunk this movie by xxunk to xxup nasa weather data i ' d say is a xxunk , but weak and xxunk argument . what about the rest of the xxunk and xxunk in the movie ? a certain wise man once said something about xxunk xxunk and xxunk xxunk . do you in any way feel that maybe this could xxunk to what you are xxunk to do here ? : - ) this movie is just packed with xxunk evidence against the xxunk once made by u . s . government that the xxunk were a success , and that man now are true masters of the universe . things are xxunk never xxunk what they xxunk .. just watch the movie , and i dear say you ' ll see things a bit different than before .\" \n", - " \n", - " first off , weather data doesn ' t come from xxup nasa , it comes for met xxunk around the world . second , the weather data xxunk a major xxunk in the film . third , far from being \" packed with xxunk evidence \", the remaining xxunk in the film have been xxunk xxunk . xxunk thought he had a xxunk secret piece of film , so he edited it and added his own interpretation . xxunk for him , his source film is public domain , and the xxunk xxunk edited out xxunk his xxunk .,xxbos xxup swing ! is an important film because it ' s one of the remaining xxunk and xxunk films from the 1930 s . many of these films have simply xxunk so xxunk that they are xxunk , but this one is in xxunk good shape . it ' s also a nice chance to see many of the talented black xxunk of the period just after the xxunk of the old cotton xxunk time all but xxunk today . \n", - " \n", - " xxunk , while the film is xxunk important and has some lovely performances , it ' s also a mess . 
the main plot is very similar to the hollywood xxunk of the xxunk a prima donna who is going to xxunk the show and the surprise unknown who xxunk from no where to save the day . however , the writing is just xxunk and a bit xxunk at xxunk projects images of black america that some might find a bit xxunk . this is because before the plot really gets going , you are xxunk to a xxunk xxunk who lives off his hard working wife ( a popular stereotype of the time ) and when he is xxunk with a xxunk ( who , by the way , xxunk xxunk this role ), they have a fight which xxunk like a scene from xxup wwe smackdown ! and , the one lady wants to cut the other lady with a straight xxunk xxunk scene xxunk ! later in the film , when the prima donna is xxunk xxunk , her husband xxunk her in the face and everyone xxunk him ! it xxunk like the film , at times , wants to appeal to the xxunk common xxunk in the audience xxup plus they can ' t even do this xxunk some of the worst acting i ' ve seen in a very long time . \n", - " \n", - " still , if you can look past a xxunk production in just about every way ( with xxunk characters , bad acting and direction and poor writing ), this one might be worth a xxunk so you can see excellent singing and tap xxunk well as to catch a xxunk of xxunk black culture . 
just don ' t say i didn ' t xxunk you about the xxunk ' s really , really bad !\n", - "y: LMLabelList\n", - ",,,,\n", - "Path: /root/.fastai/data/imdb_sample;\n", - "\n", - "Test: None, model=SequentialRNN(\n", - " (0): AWD_LSTM(\n", - " (encoder): Embedding(60005, 400, padding_idx=1)\n", - " (encoder_dp): EmbeddingDropout(\n", - " (emb): Embedding(60005, 400, padding_idx=1)\n", - " )\n", - " (rnns): ModuleList(\n", - " (0): WeightDropout(\n", - " (module): LSTM(400, 1550, batch_first=True)\n", - " )\n", - " (1): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (2): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (3): WeightDropout(\n", - " (module): LSTM(1550, 400, batch_first=True)\n", - " )\n", - " )\n", - " (input_dp): RNNDropout()\n", - " (hidden_dps): ModuleList(\n", - " (0): RNNDropout()\n", - " (1): RNNDropout()\n", - " (2): RNNDropout()\n", - " (3): RNNDropout()\n", - " )\n", - " )\n", - " (1): LinearDecoder(\n", - " (decoder): Linear(in_features=400, out_features=60005, bias=True)\n", - " (output_dp): RNNDropout()\n", - " )\n", - "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('/root/.fastai/data/imdb_sample'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False), functools.partial(, clip=0.12)], callbacks=[RNNTrainer\n", - "learn: LanguageLearner(data=TextLMDataBunch;\n", - "\n", - "Train: LabelList (800 items)\n", - "x: LMTextList\n", - "xxbos every once in a long while a movie will come along that will be so xxunk that i feel xxunk to xxunk people . if i labor all my days and i can save but one soul from xxunk this movie , how great will be my joy . \n", - " \n", - " where to begin my xxunk of pain . for xxunk , there was a musical xxunk every five minutes . there was no character development . 
every character was a stereotype . we had xxunk guy , fat guy who xxunk xxunk , xxunk foreign guy , etc . the script xxunk as if it were being written as the movie was being shot . the production value was so xxunk low that it xxunk like i was xxunk a junior high video presentation . have the directors , producers , etc . ever even seen a movie before ? xxunk is getting worse and worse with every new entry . the concept for this movie xxunk so funny . how could you go wrong with gary coleman and a xxunk of xxunk xxunk actors . but trust me when i say this , things went wrong , xxup very xxup wrong .,xxbos name just says it all . i xxunk this movie with my dad when it came out and having xxunk in korea he had great xxunk for the man . the xxunk thing about this film is that it only xxunk on a short period of the man ' s life - xxunk enough the man ' s entire life would have made such an epic xxunk that it is xxunk to imagine the cost for production . \n", - " \n", - " some xxunk xxunk to the xxunk xxunk about the man , which are cheap xxunk . the theme of the movie \" duty , honor , country \" are not just xxunk words xxunk from the lips of a xxunk officer - it is the deep declaration of one man ' s total xxunk to his country . \n", - " \n", - " xxunk peck being the liberal that he was xxunk a better understanding of the man . he does a great job xxunk the fearless general xxunk with the humane side of the man .,xxbos this movie xxunk at being one of the most unique movies you ' ve seen . however this comes from the fact that you can ' t make heads or tails of this mess . it almost xxunk as a series of challenges set up to xxunk xxunk or not you are xxunk to walk out of the movie and give up the money you just paid . 
if you don ' t want to feel xxunk you ' ll sit through this xxunk film and xxunk a real sense of xxunk for the actors xxunk , they ' ve all seen better days , but then you xxunk they xxunk got paid xxunk a bit of money to do this and you ' ll lose xxunk for them just like you ' ve xxunk done for the film . i can ' t go on enough about this xxunk movie , its almost something that ed wood would have made and in that case it xxunk would have been his xxunk . \n", - " \n", - " to start you are forced to sit through an opening dialogue the xxunk of which you ' ve never seen / heard , this thing has got to be five minutes long . on top of that it is xxunk , as to xxunk that you the viewer cannot read . then we meet mr . xxunk and the xxunk of xxunk lines gets xxunk , it is as if he is operating xxunk to get lines on to the movie xxunk tag line . soon we meet stephen xxunk , who i xxunk xxunk ) and he does his best not to xxunk in this but xxunk he does . then comes the ultimate xxunk , tara reid playing an intelligent role , oh help us ! tara reid is not a very talented actress and xxunk she xxunk gets xxunk in movies , in my xxunk though she should stick to movies of the american pie type . \n", - " \n", - " all in all you just may want to see this for yourself when it comes out on video , i know that i got a kick out of it , i mean xxunk all be xxunk here , xxunk its xxunk to xxunk in the xxunk of others .,xxbos from the start , you know how this movie will end . it ' s so full of clich é s your typical xxup xxunk member will not even like this movie . i give it 2 out of 10, only because of the acting of william benton . i can ' t believe people xxunk 6 + for this movie . it ' s so biased towards a ' certain point of view ' ( once a thief xxunk people xxunk ' t born bad . neither are they born good . they are born with a clean slate . it ' s society , parents and education what makes them who they are . 
and if they take the wrong turn , somewhere down the line , it xxunk isn ' t going to be the american xxunk system that gets them back on track ! xxunk , xxunk this movie like the xxunk . i bet you have better things to do with your time than waste it on this piece of xxunk . \n", - " \n", - " ,xxbos i was xxunk enough to meet george pal ( and still have my xxup ds : xxup xxunk xxunk xxunk by him ) at a convention xxunk after the release , and xxunk him why he xxunk to do the film \" camp \". before he could answer , two studio xxunk xxunk and xxunk me on how the studio \" knew best \" and how \" no one will take such a film xxunk \". i had been reading the xxunk xxunk for a couple of years thanks to a friend ( xxunk xxunk of the 1970 s will recall xxunk and his band ? i was in a couple of years of that with him ), and had higher hopes than what we got . \n", - " \n", - " the xxunk xxunk that no high adventure would ever be done xxunk , and so doing ' camp ' was the only way . several other xxunk xxunk in on my side , with pal listening as best he could . at the end of the little event , pal came up to us and xxunk , xxunk he could have done more and better . \n", - " \n", - " xxup star xxup wars put the lie to the xxunk , and a year after pal ' s death , spielberg and lucas xxunk that doc savage could have xxunk been the next major movie franchise xxunk if it xxunk ' t been for the xxunk . \n", - " \n", - " tear out the memory or history of doc , and the film would have been worth a 6 / 10 rating as nothing more than a xxunk xxunk seller . \n", - " \n", - " but xxunk the legacy like that was no less an xxunk than killing a baby in the xxunk . \n", - " \n", - " doc savage can still come to the screen , and survive the xxunk xxunk by the xxunk to indiana jones , but it would have to be done in all xxunk and xxunk to xxunk the glory that we should expect from the first american xxunk . 
\n", - " \n", - " xxup xxunk : yes , there was a second script for xxup xxunk xxup of xxup evil , and it ' s a lot more serious . yes , there was xxunk xxunk shot , but mostly xxunk xxunk and very little with actors . and , yes , there _ is _ a xxunk of ron xxunk xxunk over a brick wall and xxunk at something over his shoulder with a xxunk built bronze xxunk . xxunk ' s xxunk a xxunk over a button down white shirt with a bronze tie , and the words \" xxup doc xxup savage : xxup xxunk xxup of xxup evil xxunk coming next summer !\" xxup xxunk : if anyone knows who the studio xxunk were that xxunk george pal in 1975 to san diego for the convention , xxunk the xxunk up the side of the head and call them the xxunk that they are . at the time , they were doing xxunk and fu xxunk in stripes and xxunk canvas xxunk , and carrying paramount xxunk .\n", - "y: LMLabelList\n", - ",,,,\n", - "Path: /root/.fastai/data/imdb_sample;\n", - "\n", - "Valid: LabelList (200 items)\n", - "x: LMTextList\n", - "xxbos does any one know what the 2 sports cars were ? i think robert stack ' s might have been a xxunk . rock hudson ' s character told his father he was taking a job in iraq , isn ' t that xxunk ? i have had xxunk malone in my xxunk bank most of my life , maybe this was the film that xxunk me . xxunk xxunk sure did have some xxunk in this film and xxunk xxunk malone but xxunk ' s part made a more xxunk impact so she got the oscar for best supporting role . was xxunk ' s part xxunk a leading role ? old man xxunk character was was xxunk a pretty common picture of xxunk of his era in that he was a regular guy who made it big in an emerging industry but in building a whole town he had xxunk his children to have his wife bring them up . in time , being xxunk he xxunk that they were all he really had and they were xxunk rotten , looking for attention , so rather than try to xxunk to his children he xxunk his head off . an ancient morality tale . 
but xxunk , what were those sports cars ?,xxbos god bless 80 ' s xxunk films . this is a fun , fun movie . this is what xxunk films are all about . now i ' m not xxunk horror movies , just xxunk films . it goes like this : a high school nerd is xxunk on by all these stupid xxunk and xxunk , and then one of their xxunk goes xxunk wrong . xxunk and back for revenge , xxunk a joker / xxunk mask ( pretty xxunk looking , might i add ), marty begins to kill off those xxunk one by one many years later , after he xxunk to make them believe that their old xxunk high school is having a reunion . that is xxunk the plot ? what ' s wrong with that ? that ' s the beauty of 80 ' s xxunk films , most of them i would say . a lot of things could be so xxunk , but they keep drawing you more in an ' in as they go by . xxunk this film . \n", - " \n", - " it features some xxunk xxunk , and some are xxunk creative as well . ( poisoning of a xxunk can , acid bath , i can ' t remember a xxunk ever being used before in any other xxunk film either ) it really is a fun , fun movie . that ' s all it is . nevermind the fact that the characters are complete xxunk , never mind their xxunk , and never mind the xxunk , random things that xxunk in this film . such as lights being able to be controlled by the killer ( when he ' s not even switching any xxunk , you ' ll see ) and xxunk being able to xxunk up blood , xxunk being able to have acid come out of them , just use that as part of your entertainment ! because xxunk what really makes it xxunk . \n", - " \n", - " movies like this represent 80 ' s xxunk . never again could movies like this get made , know why ? it isn ' t the 80 ' s xxunk . that is why you should just xxunk them for what they are , good fun ! i highly xxunk this film if you ' re a hardcore fan of xxunk such as friday the 13 th . \n", - " \n", - " one last note this movie also had a kick ass xxunk as well , marty xxunk . 
a xxunk , nerd , who kills all his old xxunk in a xxunk xxunk mask . a good xxunk makes a good xxunk . simon xxunk , who played marty xxunk xxunk suicide xxunk after xxunk high was released . that alone xxunk something xxunk to the film , and sticks with it and it even makes you feel more sorry for the marty character , i guess . all in all , great 80 ' s xxunk fun ! it ' s a shame it will never be the same again xxunk,xxbos the basic formula for the original series was ; take someone , get the audience to like them , then put them into xxunk danger . this formula xxunk for the 32 xxunk made between 1964 - 68. \n", - " \n", - " now , we jump forward 40 years to xxunk we are xxunk to alan tracy , a xxunk xxunk college school kid , with his friend , fermat , a young xxunk . they are xxunk off by lady xxunk in her pink ford xxunk to the island paradise where the tracy family live , for the school xxunk . almost xxunk , they are left in the care of xxunk and his daughter , xxunk xxunk the xxunk go to rescue john from xxunk 5 which has been xxunk by a xxunk xxunk . this is all part of the hood ' s scheme to take over tracy island so that he can steal the xxunk machines xxunk \n", - " \n", - " xxunk to rob a bank ! \n", - " \n", - " yes . the plot xxup is as xxunk as that ! \n", - " \n", - " the dialogue is xxunk , the acting more wooden than that of the ( xxunk ) puppets , the effects , anything but special and hans xxunk ' s score xxunk what little there was of barry gray ' s glorious theme xxunk through xxunk ' s xxunk xxunk . the rest of the score was xxunk xxunk . in fact , part of the score was broadcast the following week on the radio and didn ' t xxunk it ! i didn ' t even xxunk to stay to witness xxunk ' s xxunk xxunk with the end titles \n", - " \n", - " to be fair , ron cook xxunk xxunk well as parker , he and sophia myles as xxunk xxunk xxunk . with the right material , they could have been show xxunk . 
the xxup cgi work was what i would have called leading edge - 5 years ago . \n", - " \n", - " the dynamics of the main craft were just wrong ; the original series models at least xxunk as if they had mass \n", - " \n", - " another xxunk point is that the whole production xxunk to be one long set of product xxunk , from every vehicle being built by ford to the entire content of the tracy xxunk being produced by ben & jerry ' s . \n", - " \n", - " my son ( 9 ) xxunk the film but this cross between spy kids and ' xxunk ', xxunk xxunk at his age group , added nothing to the xxunk legend . when star trek hit the big screen in 1979 with ' the motion picture ', a whole new xxunk of life was xxunk into the franchise which then continued for another 20 years or so . with this film , xxunk has xxunk a golden opportunity to do the same with the xxunk franchise . \n", - " \n", - " i xxunk that this film , like ' the avengers ' and ' the saint ' before it , will sink into xxunk within 6 months , leaving the original series to its ' classic ' status .,xxbos the views of earth that are xxunk in this film to have been xxunk by xxup nasa have xxunk been xxunk with the historical weather data for the time of apollo 11, and show a good match between the cloud patterns in the video sequence and the xxunk xxunk records on the day . \n", - " \n", - " this would xxunk to xxunk the entire argument put forward in the film that the \" whole earth \" picture is xxunk a small part of the planet framed by the spacecraft window . \n", - " \n", - " i am waiting for bart xxunk to now xxunk that the historical weather data has been xxunk by xxup nasa , though that would no doubt xxunk them in also xxunk every xxunk newspaper copy with a weather map , and the ones in private hands would still be a problem . \n", - " \n", - " ah , a response : \" xxunk to xxunk this movie by xxunk to xxup nasa weather data i ' d say is a xxunk , but weak and xxunk argument . 
what about the rest of the xxunk and xxunk in the movie ? a certain wise man once said something about xxunk xxunk and xxunk xxunk . do you in any way feel that maybe this could xxunk to what you are xxunk to do here ? : - ) this movie is just packed with xxunk evidence against the xxunk once made by u . s . government that the xxunk were a success , and that man now are true masters of the universe . things are xxunk never xxunk what they xxunk .. just watch the movie , and i dear say you ' ll see things a bit different than before .\" \n", - " \n", - " first off , weather data doesn ' t come from xxup nasa , it comes for met xxunk around the world . second , the weather data xxunk a major xxunk in the film . third , far from being \" packed with xxunk evidence \", the remaining xxunk in the film have been xxunk xxunk . xxunk thought he had a xxunk secret piece of film , so he edited it and added his own interpretation . xxunk for him , his source film is public domain , and the xxunk xxunk edited out xxunk his xxunk .,xxbos xxup swing ! is an important film because it ' s one of the remaining xxunk and xxunk films from the 1930 s . many of these films have simply xxunk so xxunk that they are xxunk , but this one is in xxunk good shape . it ' s also a nice chance to see many of the talented black xxunk of the period just after the xxunk of the old cotton xxunk time all but xxunk today . \n", - " \n", - " xxunk , while the film is xxunk important and has some lovely performances , it ' s also a mess . the main plot is very similar to the hollywood xxunk of the xxunk a prima donna who is going to xxunk the show and the surprise unknown who xxunk from no where to save the day . however , the writing is just xxunk and a bit xxunk at xxunk projects images of black america that some might find a bit xxunk . 
this is because before the plot really gets going , you are xxunk to a xxunk xxunk who lives off his hard working wife ( a popular stereotype of the time ) and when he is xxunk with a xxunk ( who , by the way , xxunk xxunk this role ), they have a fight which xxunk like a scene from xxup wwe smackdown ! and , the one lady wants to cut the other lady with a straight xxunk xxunk scene xxunk ! later in the film , when the prima donna is xxunk xxunk , her husband xxunk her in the face and everyone xxunk him ! it xxunk like the film , at times , wants to appeal to the xxunk common xxunk in the audience xxup plus they can ' t even do this xxunk some of the worst acting i ' ve seen in a very long time . \n", - " \n", - " still , if you can look past a xxunk production in just about every way ( with xxunk characters , bad acting and direction and poor writing ), this one might be worth a xxunk so you can see excellent singing and tap xxunk well as to catch a xxunk of xxunk black culture . just don ' t say i didn ' t xxunk you about the xxunk ' s really , really bad !\n", - "y: LMLabelList\n", - ",,,,\n", - "Path: /root/.fastai/data/imdb_sample;\n", - "\n", - "Test: None, model=SequentialRNN(\n", - " (0): AWD_LSTM(\n", - " (encoder): Embedding(60005, 400, padding_idx=1)\n", - " (encoder_dp): EmbeddingDropout(\n", - " (emb): Embedding(60005, 400, padding_idx=1)\n", - " )\n", - " (rnns): ModuleList(\n", - " (0): WeightDropout(\n", - " (module): LSTM(400, 1550, batch_first=True)\n", - " )\n", - " (1): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (2): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (3): WeightDropout(\n", - " (module): LSTM(1550, 400, batch_first=True)\n", - " )\n", - " )\n", - " (input_dp): RNNDropout()\n", - " (hidden_dps): ModuleList(\n", - " (0): RNNDropout()\n", - " (1): RNNDropout()\n", - " (2): RNNDropout()\n", - " (3): RNNDropout()\n", - " )\n", - " )\n", - " (1): 
LinearDecoder(\n", - " (decoder): Linear(in_features=400, out_features=60005, bias=True)\n", - " (output_dp): RNNDropout()\n", - " )\n", - "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('/root/.fastai/data/imdb_sample'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False), functools.partial(, clip=0.12)], callbacks=[...], layer_groups=[Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(400, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 400, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): Embedding(60005, 400, padding_idx=1)\n", - " (1): EmbeddingDropout(\n", - " (emb): Embedding(60005, 400, padding_idx=1)\n", - " )\n", - " (2): LinearDecoder(\n", - " (decoder): Linear(in_features=400, out_features=60005, bias=True)\n", - " (output_dp): RNNDropout()\n", - " )\n", - ")], add_time=True, silent=False, cb_fns_registered=False)\n", - "alpha: 2\n", - "beta: 1], layer_groups=[Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(400, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 1550, batch_first=True)\n", - " )\n", - " (1): RNNDropout()\n", - "), Sequential(\n", - " (0): WeightDropout(\n", - " (module): LSTM(1550, 400, batch_first=True)\n", - " )\n", - " (1): 
RNNDropout()\n", - "), Sequential(\n", - " (0): Embedding(60005, 400, padding_idx=1)\n", - " (1): EmbeddingDropout(\n", - " (emb): Embedding(60005, 400, padding_idx=1)\n", - " )\n", - " (2): LinearDecoder(\n", - " (decoder): Linear(in_features=400, out_features=60005, bias=True)\n", - " (output_dp): RNNDropout()\n", - " )\n", - ")], add_time=True, silent=False, cb_fns_registered=False)" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 7 - } + "data": { + "text/plain": [ + "60005" ] + }, + "execution_count": 6, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "#check vocab size\n", + "len(data_lm.vocab.itos)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, + "colab_type": "code", + "id": "B9DJVRZ-KROb", + "outputId": "54031979-d708-4550-d0b7-8cd42b07cf96" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "SwJK_G80KROl", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 122 - }, - "outputId": "2d1fdff0-66e7-4f96-93fc-7e112c4306c8" - }, - "source": [ - "print(learn.predict('กาลครั้งหนึ่งนานมาแล้ว ', 200, temperature=0.8, min_p=0.005, sep = ''))" - ], - "execution_count": 24, - "outputs": [ - { - "output_type": "stream", - "text": [ - "กาลครั้งหนึ่งนานมาแล้ว คุณวันจันทร์ได้รับการอุปการะจากแม่เธอ \n", - " \n", - " วันต่อมา เธอได้พบกับ \"อาเธอร์ โอลด์เดน\" เด็กหนุ่มที่มีบุคลิกคล้ายกับ \"เอมิลี\" ซึ่งเป็นน้องสาวของนาง เขาจึงได้รับการเลี้ยงดูจาก \"อาเธอร์ ดีอา\" และเป็นผู้ที่คอยดูแลเธออยู่เสมอ เธอได้แนะนำให้เธอเป็นผู้หญิง \n", - " \n", - " โดยมี \"เอลซ่า\" ซึ่งเป็นทายาทของ \"อาเธอร์ ยูจีน\" ผู้เป็นสามีของเขา และเคยช่วยยูนิตที่ถูกส่งตัวไปประจำอยู่ที่ดินแดนแห่งนี้ เธอได้พบกับ \"ยูลิสซิส เกรย์ เอลิซาเบธ เอลลิส \" (เอมิลี่ ไอเซนฮา) ซึ่งเป็นชาว \"เผ่าเอลฟ์ \" และเธอก็ไม่ค่อยมีบุตร แต่เธอก็ได้รับความช่วยเหลือจาก \"\n" - ], - "name": "stdout" - } + "data": { + 
"text/plain": [ + "LanguageLearner(data=TextLMDataBunch;\n", + "\n", + "Train: LabelList (800 items)\n", + "x: LMTextList\n", + "xxbos every once in a long while a movie will come along that will be so xxunk that i feel xxunk to xxunk people . if i labor all my days and i can save but one soul from xxunk this movie , how great will be my joy . \n", + " \n", + " where to begin my xxunk of pain . for xxunk , there was a musical xxunk every five minutes . there was no character development . every character was a stereotype . we had xxunk guy , fat guy who xxunk xxunk , xxunk foreign guy , etc . the script xxunk as if it were being written as the movie was being shot . the production value was so xxunk low that it xxunk like i was xxunk a junior high video presentation . have the directors , producers , etc . ever even seen a movie before ? xxunk is getting worse and worse with every new entry . the concept for this movie xxunk so funny . how could you go wrong with gary coleman and a xxunk of xxunk xxunk actors . but trust me when i say this , things went wrong , xxup very xxup wrong .,xxbos name just says it all . i xxunk this movie with my dad when it came out and having xxunk in korea he had great xxunk for the man . the xxunk thing about this film is that it only xxunk on a short period of the man ' s life - xxunk enough the man ' s entire life would have made such an epic xxunk that it is xxunk to imagine the cost for production . \n", + " \n", + " some xxunk xxunk to the xxunk xxunk about the man , which are cheap xxunk . the theme of the movie \" duty , honor , country \" are not just xxunk words xxunk from the lips of a xxunk officer - it is the deep declaration of one man ' s total xxunk to his country . \n", + " \n", + " xxunk peck being the liberal that he was xxunk a better understanding of the man . 
he does a great job xxunk the fearless general xxunk with the humane side of the man .,xxbos this movie xxunk at being one of the most unique movies you ' ve seen . however this comes from the fact that you can ' t make heads or tails of this mess . it almost xxunk as a series of challenges set up to xxunk xxunk or not you are xxunk to walk out of the movie and give up the money you just paid . if you don ' t want to feel xxunk you ' ll sit through this xxunk film and xxunk a real sense of xxunk for the actors xxunk , they ' ve all seen better days , but then you xxunk they xxunk got paid xxunk a bit of money to do this and you ' ll lose xxunk for them just like you ' ve xxunk done for the film . i can ' t go on enough about this xxunk movie , its almost something that ed wood would have made and in that case it xxunk would have been his xxunk . \n", + " \n", + " to start you are forced to sit through an opening dialogue the xxunk of which you ' ve never seen / heard , this thing has got to be five minutes long . on top of that it is xxunk , as to xxunk that you the viewer cannot read . then we meet mr . xxunk and the xxunk of xxunk lines gets xxunk , it is as if he is operating xxunk to get lines on to the movie xxunk tag line . soon we meet stephen xxunk , who i xxunk xxunk ) and he does his best not to xxunk in this but xxunk he does . then comes the ultimate xxunk , tara reid playing an intelligent role , oh help us ! tara reid is not a very talented actress and xxunk she xxunk gets xxunk in movies , in my xxunk though she should stick to movies of the american pie type . \n", + " \n", + " all in all you just may want to see this for yourself when it comes out on video , i know that i got a kick out of it , i mean xxunk all be xxunk here , xxunk its xxunk to xxunk in the xxunk of others .,xxbos from the start , you know how this movie will end . it ' s so full of clich é s your typical xxup xxunk member will not even like this movie . 
i give it 2 out of 10, only because of the acting of william benton . i can ' t believe people xxunk 6 + for this movie . it ' s so biased towards a ' certain point of view ' ( once a thief xxunk people xxunk ' t born bad . neither are they born good . they are born with a clean slate . it ' s society , parents and education what makes them who they are . and if they take the wrong turn , somewhere down the line , it xxunk isn ' t going to be the american xxunk system that gets them back on track ! xxunk , xxunk this movie like the xxunk . i bet you have better things to do with your time than waste it on this piece of xxunk . \n", + " \n", + " ,xxbos i was xxunk enough to meet george pal ( and still have my xxup ds : xxup xxunk xxunk xxunk by him ) at a convention xxunk after the release , and xxunk him why he xxunk to do the film \" camp \". before he could answer , two studio xxunk xxunk and xxunk me on how the studio \" knew best \" and how \" no one will take such a film xxunk \". i had been reading the xxunk xxunk for a couple of years thanks to a friend ( xxunk xxunk of the 1970 s will recall xxunk and his band ? i was in a couple of years of that with him ), and had higher hopes than what we got . \n", + " \n", + " the xxunk xxunk that no high adventure would ever be done xxunk , and so doing ' camp ' was the only way . several other xxunk xxunk in on my side , with pal listening as best he could . at the end of the little event , pal came up to us and xxunk , xxunk he could have done more and better . \n", + " \n", + " xxup star xxup wars put the lie to the xxunk , and a year after pal ' s death , spielberg and lucas xxunk that doc savage could have xxunk been the next major movie franchise xxunk if it xxunk ' t been for the xxunk . \n", + " \n", + " tear out the memory or history of doc , and the film would have been worth a 6 / 10 rating as nothing more than a xxunk xxunk seller . 
\n", + " \n", + " but xxunk the legacy like that was no less an xxunk than killing a baby in the xxunk . \n", + " \n", + " doc savage can still come to the screen , and survive the xxunk xxunk by the xxunk to indiana jones , but it would have to be done in all xxunk and xxunk to xxunk the glory that we should expect from the first american xxunk . \n", + " \n", + " xxup xxunk : yes , there was a second script for xxup xxunk xxup of xxup evil , and it ' s a lot more serious . yes , there was xxunk xxunk shot , but mostly xxunk xxunk and very little with actors . and , yes , there _ is _ a xxunk of ron xxunk xxunk over a brick wall and xxunk at something over his shoulder with a xxunk built bronze xxunk . xxunk ' s xxunk a xxunk over a button down white shirt with a bronze tie , and the words \" xxup doc xxup savage : xxup xxunk xxup of xxup evil xxunk coming next summer !\" xxup xxunk : if anyone knows who the studio xxunk were that xxunk george pal in 1975 to san diego for the convention , xxunk the xxunk up the side of the head and call them the xxunk that they are . at the time , they were doing xxunk and fu xxunk in stripes and xxunk canvas xxunk , and carrying paramount xxunk .\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: /root/.fastai/data/imdb_sample;\n", + "\n", + "Valid: LabelList (200 items)\n", + "x: LMTextList\n", + "xxbos does any one know what the 2 sports cars were ? i think robert stack ' s might have been a xxunk . rock hudson ' s character told his father he was taking a job in iraq , isn ' t that xxunk ? i have had xxunk malone in my xxunk bank most of my life , maybe this was the film that xxunk me . xxunk xxunk sure did have some xxunk in this film and xxunk xxunk malone but xxunk ' s part made a more xxunk impact so she got the oscar for best supporting role . was xxunk ' s part xxunk a leading role ? 
old man xxunk character was was xxunk a pretty common picture of xxunk of his era in that he was a regular guy who made it big in an emerging industry but in building a whole town he had xxunk his children to have his wife bring them up . in time , being xxunk he xxunk that they were all he really had and they were xxunk rotten , looking for attention , so rather than try to xxunk to his children he xxunk his head off . an ancient morality tale . but xxunk , what were those sports cars ?,xxbos god bless 80 ' s xxunk films . this is a fun , fun movie . this is what xxunk films are all about . now i ' m not xxunk horror movies , just xxunk films . it goes like this : a high school nerd is xxunk on by all these stupid xxunk and xxunk , and then one of their xxunk goes xxunk wrong . xxunk and back for revenge , xxunk a joker / xxunk mask ( pretty xxunk looking , might i add ), marty begins to kill off those xxunk one by one many years later , after he xxunk to make them believe that their old xxunk high school is having a reunion . that is xxunk the plot ? what ' s wrong with that ? that ' s the beauty of 80 ' s xxunk films , most of them i would say . a lot of things could be so xxunk , but they keep drawing you more in an ' in as they go by . xxunk this film . \n", + " \n", + " it features some xxunk xxunk , and some are xxunk creative as well . ( poisoning of a xxunk can , acid bath , i can ' t remember a xxunk ever being used before in any other xxunk film either ) it really is a fun , fun movie . that ' s all it is . nevermind the fact that the characters are complete xxunk , never mind their xxunk , and never mind the xxunk , random things that xxunk in this film . such as lights being able to be controlled by the killer ( when he ' s not even switching any xxunk , you ' ll see ) and xxunk being able to xxunk up blood , xxunk being able to have acid come out of them , just use that as part of your entertainment ! because xxunk what really makes it xxunk . 
\n", + " \n", + " movies like this represent 80 ' s xxunk . never again could movies like this get made , know why ? it isn ' t the 80 ' s xxunk . that is why you should just xxunk them for what they are , good fun ! i highly xxunk this film if you ' re a hardcore fan of xxunk such as friday the 13 th . \n", + " \n", + " one last note this movie also had a kick ass xxunk as well , marty xxunk . a xxunk , nerd , who kills all his old xxunk in a xxunk xxunk mask . a good xxunk makes a good xxunk . simon xxunk , who played marty xxunk xxunk suicide xxunk after xxunk high was released . that alone xxunk something xxunk to the film , and sticks with it and it even makes you feel more sorry for the marty character , i guess . all in all , great 80 ' s xxunk fun ! it ' s a shame it will never be the same again xxunk,xxbos the basic formula for the original series was ; take someone , get the audience to like them , then put them into xxunk danger . this formula xxunk for the 32 xxunk made between 1964 - 68. \n", + " \n", + " now , we jump forward 40 years to xxunk we are xxunk to alan tracy , a xxunk xxunk college school kid , with his friend , fermat , a young xxunk . they are xxunk off by lady xxunk in her pink ford xxunk to the island paradise where the tracy family live , for the school xxunk . almost xxunk , they are left in the care of xxunk and his daughter , xxunk xxunk the xxunk go to rescue john from xxunk 5 which has been xxunk by a xxunk xxunk . this is all part of the hood ' s scheme to take over tracy island so that he can steal the xxunk machines xxunk \n", + " \n", + " xxunk to rob a bank ! \n", + " \n", + " yes . the plot xxup is as xxunk as that ! \n", + " \n", + " the dialogue is xxunk , the acting more wooden than that of the ( xxunk ) puppets , the effects , anything but special and hans xxunk ' s score xxunk what little there was of barry gray ' s glorious theme xxunk through xxunk ' s xxunk xxunk . the rest of the score was xxunk xxunk . 
in fact , part of the score was broadcast the following week on the radio and didn ' t xxunk it ! i didn ' t even xxunk to stay to witness xxunk ' s xxunk xxunk with the end titles \n", + " \n", + " to be fair , ron cook xxunk xxunk well as parker , he and sophia myles as xxunk xxunk xxunk . with the right material , they could have been show xxunk . the xxup cgi work was what i would have called leading edge - 5 years ago . \n", + " \n", + " the dynamics of the main craft were just wrong ; the original series models at least xxunk as if they had mass \n", + " \n", + " another xxunk point is that the whole production xxunk to be one long set of product xxunk , from every vehicle being built by ford to the entire content of the tracy xxunk being produced by ben & jerry ' s . \n", + " \n", + " my son ( 9 ) xxunk the film but this cross between spy kids and ' xxunk ', xxunk xxunk at his age group , added nothing to the xxunk legend . when star trek hit the big screen in 1979 with ' the motion picture ', a whole new xxunk of life was xxunk into the franchise which then continued for another 20 years or so . with this film , xxunk has xxunk a golden opportunity to do the same with the xxunk franchise . \n", + " \n", + " i xxunk that this film , like ' the avengers ' and ' the saint ' before it , will sink into xxunk within 6 months , leaving the original series to its ' classic ' status .,xxbos the views of earth that are xxunk in this film to have been xxunk by xxup nasa have xxunk been xxunk with the historical weather data for the time of apollo 11, and show a good match between the cloud patterns in the video sequence and the xxunk xxunk records on the day . \n", + " \n", + " this would xxunk to xxunk the entire argument put forward in the film that the \" whole earth \" picture is xxunk a small part of the planet framed by the spacecraft window . 
\n", + " \n", + " i am waiting for bart xxunk to now xxunk that the historical weather data has been xxunk by xxup nasa , though that would no doubt xxunk them in also xxunk every xxunk newspaper copy with a weather map , and the ones in private hands would still be a problem . \n", + " \n", + " ah , a response : \" xxunk to xxunk this movie by xxunk to xxup nasa weather data i ' d say is a xxunk , but weak and xxunk argument . what about the rest of the xxunk and xxunk in the movie ? a certain wise man once said something about xxunk xxunk and xxunk xxunk . do you in any way feel that maybe this could xxunk to what you are xxunk to do here ? : - ) this movie is just packed with xxunk evidence against the xxunk once made by u . s . government that the xxunk were a success , and that man now are true masters of the universe . things are xxunk never xxunk what they xxunk .. just watch the movie , and i dear say you ' ll see things a bit different than before .\" \n", + " \n", + " first off , weather data doesn ' t come from xxup nasa , it comes for met xxunk around the world . second , the weather data xxunk a major xxunk in the film . third , far from being \" packed with xxunk evidence \", the remaining xxunk in the film have been xxunk xxunk . xxunk thought he had a xxunk secret piece of film , so he edited it and added his own interpretation . xxunk for him , his source film is public domain , and the xxunk xxunk edited out xxunk his xxunk .,xxbos xxup swing ! is an important film because it ' s one of the remaining xxunk and xxunk films from the 1930 s . many of these films have simply xxunk so xxunk that they are xxunk , but this one is in xxunk good shape . it ' s also a nice chance to see many of the talented black xxunk of the period just after the xxunk of the old cotton xxunk time all but xxunk today . \n", + " \n", + " xxunk , while the film is xxunk important and has some lovely performances , it ' s also a mess . 
the main plot is very similar to the hollywood xxunk of the xxunk a prima donna who is going to xxunk the show and the surprise unknown who xxunk from no where to save the day . however , the writing is just xxunk and a bit xxunk at xxunk projects images of black america that some might find a bit xxunk . this is because before the plot really gets going , you are xxunk to a xxunk xxunk who lives off his hard working wife ( a popular stereotype of the time ) and when he is xxunk with a xxunk ( who , by the way , xxunk xxunk this role ), they have a fight which xxunk like a scene from xxup wwe smackdown ! and , the one lady wants to cut the other lady with a straight xxunk xxunk scene xxunk ! later in the film , when the prima donna is xxunk xxunk , her husband xxunk her in the face and everyone xxunk him ! it xxunk like the film , at times , wants to appeal to the xxunk common xxunk in the audience xxup plus they can ' t even do this xxunk some of the worst acting i ' ve seen in a very long time . \n", + " \n", + " still , if you can look past a xxunk production in just about every way ( with xxunk characters , bad acting and direction and poor writing ), this one might be worth a xxunk so you can see excellent singing and tap xxunk well as to catch a xxunk of xxunk black culture . 
just don ' t say i didn ' t xxunk you about the xxunk ' s really , really bad !\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: /root/.fastai/data/imdb_sample;\n", + "\n", + "Test: None, model=SequentialRNN(\n", + " (0): AWD_LSTM(\n", + " (encoder): Embedding(60005, 400, padding_idx=1)\n", + " (encoder_dp): EmbeddingDropout(\n", + " (emb): Embedding(60005, 400, padding_idx=1)\n", + " )\n", + " (rnns): ModuleList(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (2): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (3): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " )\n", + " (input_dp): RNNDropout()\n", + " (hidden_dps): ModuleList(\n", + " (0): RNNDropout()\n", + " (1): RNNDropout()\n", + " (2): RNNDropout()\n", + " (3): RNNDropout()\n", + " )\n", + " )\n", + " (1): LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=60005, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('/root/.fastai/data/imdb_sample'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False), functools.partial(, clip=0.12)], callbacks=[RNNTrainer\n", + "learn: LanguageLearner(data=TextLMDataBunch;\n", + "\n", + "Train: LabelList (800 items)\n", + "x: LMTextList\n", + "xxbos every once in a long while a movie will come along that will be so xxunk that i feel xxunk to xxunk people . if i labor all my days and i can save but one soul from xxunk this movie , how great will be my joy . \n", + " \n", + " where to begin my xxunk of pain . for xxunk , there was a musical xxunk every five minutes . there was no character development . 
every character was a stereotype . we had xxunk guy , fat guy who xxunk xxunk , xxunk foreign guy , etc . the script xxunk as if it were being written as the movie was being shot . the production value was so xxunk low that it xxunk like i was xxunk a junior high video presentation . have the directors , producers , etc . ever even seen a movie before ? xxunk is getting worse and worse with every new entry . the concept for this movie xxunk so funny . how could you go wrong with gary coleman and a xxunk of xxunk xxunk actors . but trust me when i say this , things went wrong , xxup very xxup wrong .,xxbos name just says it all . i xxunk this movie with my dad when it came out and having xxunk in korea he had great xxunk for the man . the xxunk thing about this film is that it only xxunk on a short period of the man ' s life - xxunk enough the man ' s entire life would have made such an epic xxunk that it is xxunk to imagine the cost for production . \n", + " \n", + " some xxunk xxunk to the xxunk xxunk about the man , which are cheap xxunk . the theme of the movie \" duty , honor , country \" are not just xxunk words xxunk from the lips of a xxunk officer - it is the deep declaration of one man ' s total xxunk to his country . \n", + " \n", + " xxunk peck being the liberal that he was xxunk a better understanding of the man . he does a great job xxunk the fearless general xxunk with the humane side of the man .,xxbos this movie xxunk at being one of the most unique movies you ' ve seen . however this comes from the fact that you can ' t make heads or tails of this mess . it almost xxunk as a series of challenges set up to xxunk xxunk or not you are xxunk to walk out of the movie and give up the money you just paid . 
if you don ' t want to feel xxunk you ' ll sit through this xxunk film and xxunk a real sense of xxunk for the actors xxunk , they ' ve all seen better days , but then you xxunk they xxunk got paid xxunk a bit of money to do this and you ' ll lose xxunk for them just like you ' ve xxunk done for the film . i can ' t go on enough about this xxunk movie , its almost something that ed wood would have made and in that case it xxunk would have been his xxunk . \n", + " \n", + " to start you are forced to sit through an opening dialogue the xxunk of which you ' ve never seen / heard , this thing has got to be five minutes long . on top of that it is xxunk , as to xxunk that you the viewer cannot read . then we meet mr . xxunk and the xxunk of xxunk lines gets xxunk , it is as if he is operating xxunk to get lines on to the movie xxunk tag line . soon we meet stephen xxunk , who i xxunk xxunk ) and he does his best not to xxunk in this but xxunk he does . then comes the ultimate xxunk , tara reid playing an intelligent role , oh help us ! tara reid is not a very talented actress and xxunk she xxunk gets xxunk in movies , in my xxunk though she should stick to movies of the american pie type . \n", + " \n", + " all in all you just may want to see this for yourself when it comes out on video , i know that i got a kick out of it , i mean xxunk all be xxunk here , xxunk its xxunk to xxunk in the xxunk of others .,xxbos from the start , you know how this movie will end . it ' s so full of clich é s your typical xxup xxunk member will not even like this movie . i give it 2 out of 10, only because of the acting of william benton . i can ' t believe people xxunk 6 + for this movie . it ' s so biased towards a ' certain point of view ' ( once a thief xxunk people xxunk ' t born bad . neither are they born good . they are born with a clean slate . it ' s society , parents and education what makes them who they are . 
and if they take the wrong turn , somewhere down the line , it xxunk isn ' t going to be the american xxunk system that gets them back on track ! xxunk , xxunk this movie like the xxunk . i bet you have better things to do with your time than waste it on this piece of xxunk . \n", + " \n", + " ,xxbos i was xxunk enough to meet george pal ( and still have my xxup ds : xxup xxunk xxunk xxunk by him ) at a convention xxunk after the release , and xxunk him why he xxunk to do the film \" camp \". before he could answer , two studio xxunk xxunk and xxunk me on how the studio \" knew best \" and how \" no one will take such a film xxunk \". i had been reading the xxunk xxunk for a couple of years thanks to a friend ( xxunk xxunk of the 1970 s will recall xxunk and his band ? i was in a couple of years of that with him ), and had higher hopes than what we got . \n", + " \n", + " the xxunk xxunk that no high adventure would ever be done xxunk , and so doing ' camp ' was the only way . several other xxunk xxunk in on my side , with pal listening as best he could . at the end of the little event , pal came up to us and xxunk , xxunk he could have done more and better . \n", + " \n", + " xxup star xxup wars put the lie to the xxunk , and a year after pal ' s death , spielberg and lucas xxunk that doc savage could have xxunk been the next major movie franchise xxunk if it xxunk ' t been for the xxunk . \n", + " \n", + " tear out the memory or history of doc , and the film would have been worth a 6 / 10 rating as nothing more than a xxunk xxunk seller . \n", + " \n", + " but xxunk the legacy like that was no less an xxunk than killing a baby in the xxunk . \n", + " \n", + " doc savage can still come to the screen , and survive the xxunk xxunk by the xxunk to indiana jones , but it would have to be done in all xxunk and xxunk to xxunk the glory that we should expect from the first american xxunk . 
\n", + " \n", + " xxup xxunk : yes , there was a second script for xxup xxunk xxup of xxup evil , and it ' s a lot more serious . yes , there was xxunk xxunk shot , but mostly xxunk xxunk and very little with actors . and , yes , there _ is _ a xxunk of ron xxunk xxunk over a brick wall and xxunk at something over his shoulder with a xxunk built bronze xxunk . xxunk ' s xxunk a xxunk over a button down white shirt with a bronze tie , and the words \" xxup doc xxup savage : xxup xxunk xxup of xxup evil xxunk coming next summer !\" xxup xxunk : if anyone knows who the studio xxunk were that xxunk george pal in 1975 to san diego for the convention , xxunk the xxunk up the side of the head and call them the xxunk that they are . at the time , they were doing xxunk and fu xxunk in stripes and xxunk canvas xxunk , and carrying paramount xxunk .\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: /root/.fastai/data/imdb_sample;\n", + "\n", + "Valid: LabelList (200 items)\n", + "x: LMTextList\n", + "xxbos does any one know what the 2 sports cars were ? i think robert stack ' s might have been a xxunk . rock hudson ' s character told his father he was taking a job in iraq , isn ' t that xxunk ? i have had xxunk malone in my xxunk bank most of my life , maybe this was the film that xxunk me . xxunk xxunk sure did have some xxunk in this film and xxunk xxunk malone but xxunk ' s part made a more xxunk impact so she got the oscar for best supporting role . was xxunk ' s part xxunk a leading role ? old man xxunk character was was xxunk a pretty common picture of xxunk of his era in that he was a regular guy who made it big in an emerging industry but in building a whole town he had xxunk his children to have his wife bring them up . in time , being xxunk he xxunk that they were all he really had and they were xxunk rotten , looking for attention , so rather than try to xxunk to his children he xxunk his head off . an ancient morality tale . 
but xxunk , what were those sports cars ?,xxbos god bless 80 ' s xxunk films . this is a fun , fun movie . this is what xxunk films are all about . now i ' m not xxunk horror movies , just xxunk films . it goes like this : a high school nerd is xxunk on by all these stupid xxunk and xxunk , and then one of their xxunk goes xxunk wrong . xxunk and back for revenge , xxunk a joker / xxunk mask ( pretty xxunk looking , might i add ), marty begins to kill off those xxunk one by one many years later , after he xxunk to make them believe that their old xxunk high school is having a reunion . that is xxunk the plot ? what ' s wrong with that ? that ' s the beauty of 80 ' s xxunk films , most of them i would say . a lot of things could be so xxunk , but they keep drawing you more in an ' in as they go by . xxunk this film . \n", + " \n", + " it features some xxunk xxunk , and some are xxunk creative as well . ( poisoning of a xxunk can , acid bath , i can ' t remember a xxunk ever being used before in any other xxunk film either ) it really is a fun , fun movie . that ' s all it is . nevermind the fact that the characters are complete xxunk , never mind their xxunk , and never mind the xxunk , random things that xxunk in this film . such as lights being able to be controlled by the killer ( when he ' s not even switching any xxunk , you ' ll see ) and xxunk being able to xxunk up blood , xxunk being able to have acid come out of them , just use that as part of your entertainment ! because xxunk what really makes it xxunk . \n", + " \n", + " movies like this represent 80 ' s xxunk . never again could movies like this get made , know why ? it isn ' t the 80 ' s xxunk . that is why you should just xxunk them for what they are , good fun ! i highly xxunk this film if you ' re a hardcore fan of xxunk such as friday the 13 th . \n", + " \n", + " one last note this movie also had a kick ass xxunk as well , marty xxunk . 
a xxunk , nerd , who kills all his old xxunk in a xxunk xxunk mask . a good xxunk makes a good xxunk . simon xxunk , who played marty xxunk xxunk suicide xxunk after xxunk high was released . that alone xxunk something xxunk to the film , and sticks with it and it even makes you feel more sorry for the marty character , i guess . all in all , great 80 ' s xxunk fun ! it ' s a shame it will never be the same again xxunk,xxbos the basic formula for the original series was ; take someone , get the audience to like them , then put them into xxunk danger . this formula xxunk for the 32 xxunk made between 1964 - 68. \n", + " \n", + " now , we jump forward 40 years to xxunk we are xxunk to alan tracy , a xxunk xxunk college school kid , with his friend , fermat , a young xxunk . they are xxunk off by lady xxunk in her pink ford xxunk to the island paradise where the tracy family live , for the school xxunk . almost xxunk , they are left in the care of xxunk and his daughter , xxunk xxunk the xxunk go to rescue john from xxunk 5 which has been xxunk by a xxunk xxunk . this is all part of the hood ' s scheme to take over tracy island so that he can steal the xxunk machines xxunk \n", + " \n", + " xxunk to rob a bank ! \n", + " \n", + " yes . the plot xxup is as xxunk as that ! \n", + " \n", + " the dialogue is xxunk , the acting more wooden than that of the ( xxunk ) puppets , the effects , anything but special and hans xxunk ' s score xxunk what little there was of barry gray ' s glorious theme xxunk through xxunk ' s xxunk xxunk . the rest of the score was xxunk xxunk . in fact , part of the score was broadcast the following week on the radio and didn ' t xxunk it ! i didn ' t even xxunk to stay to witness xxunk ' s xxunk xxunk with the end titles \n", + " \n", + " to be fair , ron cook xxunk xxunk well as parker , he and sophia myles as xxunk xxunk xxunk . with the right material , they could have been show xxunk . 
the xxup cgi work was what i would have called leading edge - 5 years ago . \n", + " \n", + " the dynamics of the main craft were just wrong ; the original series models at least xxunk as if they had mass \n", + " \n", + " another xxunk point is that the whole production xxunk to be one long set of product xxunk , from every vehicle being built by ford to the entire content of the tracy xxunk being produced by ben & jerry ' s . \n", + " \n", + " my son ( 9 ) xxunk the film but this cross between spy kids and ' xxunk ', xxunk xxunk at his age group , added nothing to the xxunk legend . when star trek hit the big screen in 1979 with ' the motion picture ', a whole new xxunk of life was xxunk into the franchise which then continued for another 20 years or so . with this film , xxunk has xxunk a golden opportunity to do the same with the xxunk franchise . \n", + " \n", + " i xxunk that this film , like ' the avengers ' and ' the saint ' before it , will sink into xxunk within 6 months , leaving the original series to its ' classic ' status .,xxbos the views of earth that are xxunk in this film to have been xxunk by xxup nasa have xxunk been xxunk with the historical weather data for the time of apollo 11, and show a good match between the cloud patterns in the video sequence and the xxunk xxunk records on the day . \n", + " \n", + " this would xxunk to xxunk the entire argument put forward in the film that the \" whole earth \" picture is xxunk a small part of the planet framed by the spacecraft window . \n", + " \n", + " i am waiting for bart xxunk to now xxunk that the historical weather data has been xxunk by xxup nasa , though that would no doubt xxunk them in also xxunk every xxunk newspaper copy with a weather map , and the ones in private hands would still be a problem . \n", + " \n", + " ah , a response : \" xxunk to xxunk this movie by xxunk to xxup nasa weather data i ' d say is a xxunk , but weak and xxunk argument . 
what about the rest of the xxunk and xxunk in the movie ? a certain wise man once said something about xxunk xxunk and xxunk xxunk . do you in any way feel that maybe this could xxunk to what you are xxunk to do here ? : - ) this movie is just packed with xxunk evidence against the xxunk once made by u . s . government that the xxunk were a success , and that man now are true masters of the universe . things are xxunk never xxunk what they xxunk .. just watch the movie , and i dear say you ' ll see things a bit different than before .\" \n", + " \n", + " first off , weather data doesn ' t come from xxup nasa , it comes for met xxunk around the world . second , the weather data xxunk a major xxunk in the film . third , far from being \" packed with xxunk evidence \", the remaining xxunk in the film have been xxunk xxunk . xxunk thought he had a xxunk secret piece of film , so he edited it and added his own interpretation . xxunk for him , his source film is public domain , and the xxunk xxunk edited out xxunk his xxunk .,xxbos xxup swing ! is an important film because it ' s one of the remaining xxunk and xxunk films from the 1930 s . many of these films have simply xxunk so xxunk that they are xxunk , but this one is in xxunk good shape . it ' s also a nice chance to see many of the talented black xxunk of the period just after the xxunk of the old cotton xxunk time all but xxunk today . \n", + " \n", + " xxunk , while the film is xxunk important and has some lovely performances , it ' s also a mess . the main plot is very similar to the hollywood xxunk of the xxunk a prima donna who is going to xxunk the show and the surprise unknown who xxunk from no where to save the day . however , the writing is just xxunk and a bit xxunk at xxunk projects images of black america that some might find a bit xxunk . 
this is because before the plot really gets going , you are xxunk to a xxunk xxunk who lives off his hard working wife ( a popular stereotype of the time ) and when he is xxunk with a xxunk ( who , by the way , xxunk xxunk this role ), they have a fight which xxunk like a scene from xxup wwe smackdown ! and , the one lady wants to cut the other lady with a straight xxunk xxunk scene xxunk ! later in the film , when the prima donna is xxunk xxunk , her husband xxunk her in the face and everyone xxunk him ! it xxunk like the film , at times , wants to appeal to the xxunk common xxunk in the audience xxup plus they can ' t even do this xxunk some of the worst acting i ' ve seen in a very long time . \n", + " \n", + " still , if you can look past a xxunk production in just about every way ( with xxunk characters , bad acting and direction and poor writing ), this one might be worth a xxunk so you can see excellent singing and tap xxunk well as to catch a xxunk of xxunk black culture . just don ' t say i didn ' t xxunk you about the xxunk ' s really , really bad !\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: /root/.fastai/data/imdb_sample;\n", + "\n", + "Test: None, model=SequentialRNN(\n", + " (0): AWD_LSTM(\n", + " (encoder): Embedding(60005, 400, padding_idx=1)\n", + " (encoder_dp): EmbeddingDropout(\n", + " (emb): Embedding(60005, 400, padding_idx=1)\n", + " )\n", + " (rnns): ModuleList(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (2): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (3): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " )\n", + " (input_dp): RNNDropout()\n", + " (hidden_dps): ModuleList(\n", + " (0): RNNDropout()\n", + " (1): RNNDropout()\n", + " (2): RNNDropout()\n", + " (3): RNNDropout()\n", + " )\n", + " )\n", + " (1): 
LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=60005, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('/root/.fastai/data/imdb_sample'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False), functools.partial(, clip=0.12)], callbacks=[...], layer_groups=[Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): Embedding(60005, 400, padding_idx=1)\n", + " (1): EmbeddingDropout(\n", + " (emb): Embedding(60005, 400, padding_idx=1)\n", + " )\n", + " (2): LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=60005, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + ")], add_time=True, silent=False, cb_fns_registered=False)\n", + "alpha: 2\n", + "beta: 1], layer_groups=[Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " (1): 
RNNDropout()\n", + "), Sequential(\n", + " (0): Embedding(60005, 400, padding_idx=1)\n", + " (1): EmbeddingDropout(\n", + " (emb): Embedding(60005, 400, padding_idx=1)\n", + " )\n", + " (2): LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=60005, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + ")], add_time=True, silent=False, cb_fns_registered=False)" ] + }, + "execution_count": 7, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False, tie_weights=True, out_bias=True,\n", + " output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)\n", + "trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1)\n", + "\n", + "learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args)\n", + "\n", + "#load pretrained models\n", + "learn.load_pretrained(**_THWIKI_LSTM)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 122 }, + "colab_type": "code", + "id": "SwJK_G80KROl", + "outputId": "2d1fdff0-66e7-4f96-93fc-7e112c4306c8" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "MyKkpWZbMOzt", - "colab_type": "code", - "colab": {} - }, - "source": [ - "" - ], - "execution_count": 0, - "outputs": [] + "name": "stdout", + "output_type": "stream", + "text": [ + "กาลครั้งหนึ่งนานมาแล้ว คุณวันจันทร์ได้รับการอุปการะจากแม่เธอ \n", + " \n", + " วันต่อมา เธอได้พบกับ \"อาเธอร์ โอลด์เดน\" เด็กหนุ่มที่มีบุคลิกคล้ายกับ \"เอมิลี\" ซึ่งเป็นน้องสาวของนาง เขาจึงได้รับการเลี้ยงดูจาก \"อาเธอร์ ดีอา\" และเป็นผู้ที่คอยดูแลเธออยู่เสมอ เธอได้แนะนำให้เธอเป็นผู้หญิง \n", + " \n", + " โดยมี \"เอลซ่า\" ซึ่งเป็นทายาทของ \"อาเธอร์ ยูจีน\" ผู้เป็นสามีของเขา และเคยช่วยยูนิตที่ถูกส่งตัวไปประจำอยู่ที่ดินแดนแห่งนี้ เธอได้พบกับ \"ยูลิสซิส เกรย์ เอลิซาเบธ เอลลิส \" (เอมิลี่ ไอเซนฮา) ซึ่งเป็นชาว \"เผ่าเอลฟ์ \" 
def top_feats_label(X: np.ndarray, features: Collection[str], label_idx: Collection[bool] = None,
                    min_val: float = 0.1, agg_func: Callable = np.mean) -> pd.DataFrame:
    '''
    original code (Thomas Buhrman)[from https://buhrmann.github.io/tfidf-analysis.html]
    rank features of one label by their encoded values (CountVectorizer, TfidfVectorizer, etc.)
    aggregated with `agg_func`; the input matrix `X` is never modified
    :param X np.ndarray: document-value matrix (one row per document, one column per feature)
    :param features Collection[str]: feature names, indexable by column position
    :param label_idx Collection[bool]: boolean mask selecting the rows of the label; `None` uses all rows
    :param min_val float: values below this threshold are treated as 0 before aggregation
    :param agg_func Callable: how to aggregate features such as `np.mean` or `np.sum`;
        called as `agg_func(values, axis=0)`
    :return: a dataframe with `feature`, `score` and `ngram` (number of space-separated tokens),
        sorted by descending score
    '''
    # Boolean-mask indexing returns a fresh array, but the "all rows" path would
    # alias the caller's matrix; copy it so the in-place thresholding below
    # cannot zero out the caller's data.
    res = X.copy() if label_idx is None else X[label_idx]
    res[res < min_val] = 0
    res_agg = agg_func(res, axis=0)
    order = np.argsort(res_agg)[::-1]  # column positions, highest score first
    # Passing `columns=` up front keeps this working when there are no features at all.
    df = pd.DataFrame([(features[i], res_agg[i]) for i in order], columns=['feature', 'score'])
    # Count tokens, not distinct tokens: a repeated-word bigram is still a bigram.
    df['ngram'] = df.feature.map(lambda x: len(x.split(' ')))
    return df


def top_feats_all(X: np.ndarray, y: np.ndarray, features: Collection[str], min_val: float = 0.1,
                  agg_func: Callable = np.mean) -> Collection[pd.DataFrame]:
    '''
    original code (Thomas Buhrman)[from https://buhrmann.github.io/tfidf-analysis.html]
    for all labels, rank features of each label by their encoded values (CountVectorizer, TfidfVectorizer, etc.)
    aggregated with `agg_func`
    :param X np.ndarray: document-value matrix
    :param y np.ndarray: labels, one per row of `X`
    :param features Collection[str]: feature names
    :param min_val float: minimum value to take into account for each feature
    :param agg_func Callable: how to aggregate features such as `np.mean` or `np.sum`
    :return: a list of dataframes (one per unique label, in `np.unique` order) with
        `rank` (rank within label), `feature`, `score`, `ngram` and `label`
    '''
    # Make sure `y == label` is an element-wise mask even if `y` came in as a list.
    y = np.asarray(y)
    dfs = []
    for label in np.unique(y):
        df = top_feats_label(X, features, y == label, min_val, agg_func)
        # The positional index of the sorted frame is the within-label rank.
        df = df.reset_index().rename(columns={'index': 'rank'})
        df['label'] = label
        dfs.append(df)
    return dfs


def plot_top_feats(dfs: Collection[pd.DataFrame], top_n: int = 25, ngram_range: Tuple[int,int]=(1,2),)-> None:
    '''
    original code (Thomas Buhrman)[from https://buhrmann.github.io/tfidf-analysis.html]
    plot top features from a collection of `top_feats_all` dataframes, one horizontal
    bar chart per dataframe, side by side
    :param dfs Collection[pd.DataFrame]: `top_feats_all` dataframes
    :param top_n int: maximum number of top features to show per label
    :param ngram_range Tuple[int,int]: inclusive range of ngrams for features to show
    :return: nothing
    '''
    fig = plt.figure(figsize=(12, 9), facecolor="w")
    for i, df in enumerate(dfs):
        # Read the label positionally and before filtering: the row whose index
        # is 0 may be removed by the ngram filter below.
        label = df.label.iloc[0] if len(df) else ''
        top = df[(df.ngram>=ngram_range[0])&(df.ngram<=ngram_range[1])][:top_n]
        # Size the bar positions to what survived the filter; there may be fewer
        # than `top_n` rows.
        x = np.arange(len(top))
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("score", labelpad=16, fontsize=14)
        ax.set_title(f"label = {str(label)}", fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(x, top.score, align='center', color='#3F5D7D')
        ax.set_yticks(x)
        ax.set_ylim([-1, len(top)])
        ax.invert_yaxis()  # best-scoring feature at the top
        ax.set_yticklabels(top.feature)
    plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()
a/notebooks/wongnai_classification.ipynb b/notebooks/wongnai_classification.ipynb index 59d1ae760..e9196f29f 100644 --- a/notebooks/wongnai_classification.ipynb +++ b/notebooks/wongnai_classification.ipynb @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -53,10 +53,11 @@ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", - "from fastai.text import *\n", - "from fastai.callbacks import CSVLogger\n", + "# from fastai.text import *\n", + "# from fastai.callbacks import CSVLogger\n", "\n", "from pythainlp import word_tokenize\n", + "from pythainlp.ulmfit import process_thai\n", "\n", "ft_data = 'ft_data/'" ] @@ -77,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -91,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -105,7 +106,7 @@ "Name: rating, dtype: float64" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -116,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -130,7 +131,7 @@ "Name: rating, dtype: float64" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -285,7 +286,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Code for LinearSVC is provided by [@lukkiddd](https://github.com/lukkiddd).\n", + "Code for LinearSVC is initially provided by [@lukkiddd](https://github.com/lukkiddd). `pythainlp.ulmfit.process_thai` contains text cleaning rules with the default aimed for sparse models like bag of words. 
It contains `pre_rules` applied before tokenization and `post_rules` applied after.\n", "\n", "| model | micro_f1_public | micro_f1_private | \n", "|-----------|-----------------|------------------|\n", @@ -294,7 +295,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -304,13 +305,13 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e941bc57e041489ea16998e1c3f9cddd", + "model_id": "01cf5b2e041d4ce0a482073127c67c8b", "version_major": 2, "version_minor": 0 }, @@ -331,7 +332,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6672a556b6f44e829982f95efd89aa95", + "model_id": "097725d69a834eb5bda2b6d4b6c0c1e3", "version_major": 2, "version_minor": 0 }, @@ -351,25 +352,17 @@ } ], "source": [ - "import string\n", - "def process_text(text):\n", - " nopunc = [char for char in text if char not in string.punctuation]\n", - " nopunc = ''.join(nopunc)\n", - " return [word for word in word_tokenize(nopunc, engine='ulmfit') if word and not re.search(pattern=r\"\\s+\", string=word)]\n", - "def split_text(text):\n", - " return text.split()\n", - "\n", "train_splits = []\n", "test_splits = []\n", "for i in tqdm_notebook(range(train_bal.shape[0])):\n", - " train_splits.append(' '.join(process_text(train_bal['review'][i])))\n", + " train_splits.append(' '.join(process_thai(train_bal['review'][i])))\n", "for i in tqdm_notebook(range(test_df.shape[0])):\n", - " test_splits.append(' '.join(process_text(test_df['review'][i])))" + " test_splits.append(' '.join(process_thai(test_df['review'][i])))" ] }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -385,7 +378,7 @@ " verbose=0))])" ] }, - "execution_count": 94, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -396,7 +389,7 @@ 
"from sklearn.svm import LinearSVC\n", "\n", "text_clf = Pipeline([\n", - " ('vect', CountVectorizer(tokenizer=split_text, ngram_range=(1,2))),\n", + " ('vect', CountVectorizer(tokenizer=process_thai, ngram_range=(1,2))),\n", " ('tfidf', TfidfTransformer()),\n", " ('clf', LinearSVC()),\n", "])\n", @@ -406,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -415,7 +408,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ diff --git a/notebooks/word2vec_examples.ipynb b/notebooks/word2vec_examples.ipynb new file mode 100644 index 000000000..8d0243f3f --- /dev/null +++ b/notebooks/word2vec_examples.ipynb @@ -0,0 +1,1195 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Thai2Vec Embeddings Examples\n", + "\n", + "We use the embeddings from `v0.1` since it was trained specifically for word2vec as opposed to latter versions which garner to classification. The `thai2vec.bin` 51,556 word embeddings of 300 dimensions, in descending order by their frequencies (See `thai2vec.vocab`). The files are in word2vec format readable by `gensim`. Most common applications include word vector visualization, word arithmetic, word grouping, cosine similarity and sentence or document vectors. For sample code, see `thwiki_lm/word2vec_examples.ipynb`." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# #uncomment if you are running from google colab\n", + "# !pip install sklearn_crfsuite\n", + "# !pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", + "# !pip install fastai==1.0.46\n", + "# !pip install emoji" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-28T04:55:05.294165Z", + "start_time": "2018-01-28T04:55:05.203078Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Download: thai2fit_wv\n", + "thai2fit_wv 0.1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "thai2vec.bin?dl=1: 62.5MB [00:15, 4.09MB/s] \n" + ] + } + ], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline\n", + "\n", + "from pythainlp.tokenize import word_tokenize\n", + "from gensim.models import KeyedVectors\n", + "import numpy as np\n", + "\n", + "from sklearn.manifold import TSNE\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.font_manager as fm\n", + "\n", + "import dill as pickle\n", + "import pandas as pd\n", + "\n", + "model_path = 'thwiki_data/models/'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#load into gensim\n", + "# model = KeyedVectors.load_word2vec_format(f'{model_path}thai2vec.bin',binary=True)\n", + "from pythainlp import word_vector\n", + "model = word_vector.get_model()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-28T04:58:27.057401Z", + "start_time": "2018-01-28T04:58:09.721798Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...290291292293294295296297298299
ที่0.308956-0.0976990.1167450.2156120.015768-0.0641630.0621680.0396490.8649400.846904...-0.1424180.0332410.171581-0.624864-0.0093580.4491310.120130-0.122195-0.450617-0.071318
และ0.010751-0.6189710.1296650.035460-0.0075600.0276070.3978240.0265430.2540750.168328...-0.1057860.180930-0.1016300.070885-0.0372630.183606-0.049088-0.672288-1.2930440.592576
เป็น-0.015736-0.2589260.0529530.153728-0.005985-0.0210810.0410880.0573121.6332300.442729...-0.009408-0.252576-0.3055120.3725420.0491510.5684700.2665860.400800-0.7846500.197369
ของ-0.189711-0.1747740.171124-0.1867710.054294-0.114150-1.109456-0.094466-0.4470150.042377...-0.168676-0.1487380.6804040.0977020.0202700.182967-0.0839490.006287-0.707434-0.070234
มี-0.156962-0.2318630.0803120.3231570.2156950.0551450.4207940.0168420.2567590.832864...-0.044267-0.147186-0.1054240.9070780.0092990.5509530.1393370.031696-0.670379-0.008048
ได้-0.428813-0.0311940.041922-0.036608-0.0081060.076470-0.7822700.0333610.6068640.440520...0.024458-0.0250310.103389-0.0782550.0343230.459774-0.7486430.337775-0.487408-0.511535
\"\"\"\"-0.2877100.0641930.2050760.146356-0.071343-0.039451-1.8454610.1637631.0180960.272786...0.051024-0.532856-0.131856-0.090323-0.0588950.151262-0.4203580.055971-0.9308140.163908
การ0.239587-0.3036200.079953-0.453045-0.528826-0.1616920.235725-0.0996730.6916680.536159...-0.110436-0.297495-0.2174140.0451580.0666470.190095-0.304333-0.724927-0.995488-0.716609
(-0.120522-0.3557830.168180-0.377733-0.158624-0.0472490.3611400.1614600.9133140.345037...0.116285-0.318218-0.3566640.5198890.1304750.1257720.101328-0.382658-1.2053590.340139
)-0.086848-0.1552310.133015-0.0399130.1837610.115142-1.940854-0.066565-2.3997440.146722...0.019406-0.1814740.0998630.5160920.2016970.2491390.2529571.138815-0.0182090.232265
\n", + "

10 rows × 300 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "ที่ 0.308956 -0.097699 0.116745 0.215612 0.015768 -0.064163 0.062168 \n", + "และ 0.010751 -0.618971 0.129665 0.035460 -0.007560 0.027607 0.397824 \n", + "เป็น -0.015736 -0.258926 0.052953 0.153728 -0.005985 -0.021081 0.041088 \n", + "ของ -0.189711 -0.174774 0.171124 -0.186771 0.054294 -0.114150 -1.109456 \n", + "มี -0.156962 -0.231863 0.080312 0.323157 0.215695 0.055145 0.420794 \n", + "ได้ -0.428813 -0.031194 0.041922 -0.036608 -0.008106 0.076470 -0.782270 \n", + "\"\"\"\" -0.287710 0.064193 0.205076 0.146356 -0.071343 -0.039451 -1.845461 \n", + "การ 0.239587 -0.303620 0.079953 -0.453045 -0.528826 -0.161692 0.235725 \n", + "( -0.120522 -0.355783 0.168180 -0.377733 -0.158624 -0.047249 0.361140 \n", + ") -0.086848 -0.155231 0.133015 -0.039913 0.183761 0.115142 -1.940854 \n", + "\n", + " 7 8 9 ... 290 291 292 \\\n", + "ที่ 0.039649 0.864940 0.846904 ... -0.142418 0.033241 0.171581 \n", + "และ 0.026543 0.254075 0.168328 ... -0.105786 0.180930 -0.101630 \n", + "เป็น 0.057312 1.633230 0.442729 ... -0.009408 -0.252576 -0.305512 \n", + "ของ -0.094466 -0.447015 0.042377 ... -0.168676 -0.148738 0.680404 \n", + "มี 0.016842 0.256759 0.832864 ... -0.044267 -0.147186 -0.105424 \n", + "ได้ 0.033361 0.606864 0.440520 ... 0.024458 -0.025031 0.103389 \n", + "\"\"\"\" 0.163763 1.018096 0.272786 ... 0.051024 -0.532856 -0.131856 \n", + "การ -0.099673 0.691668 0.536159 ... -0.110436 -0.297495 -0.217414 \n", + "( 0.161460 0.913314 0.345037 ... 0.116285 -0.318218 -0.356664 \n", + ") -0.066565 -2.399744 0.146722 ... 
0.019406 -0.181474 0.099863 \n", + "\n", + " 293 294 295 296 297 298 299 \n", + "ที่ -0.624864 -0.009358 0.449131 0.120130 -0.122195 -0.450617 -0.071318 \n", + "และ 0.070885 -0.037263 0.183606 -0.049088 -0.672288 -1.293044 0.592576 \n", + "เป็น 0.372542 0.049151 0.568470 0.266586 0.400800 -0.784650 0.197369 \n", + "ของ 0.097702 0.020270 0.182967 -0.083949 0.006287 -0.707434 -0.070234 \n", + "มี 0.907078 0.009299 0.550953 0.139337 0.031696 -0.670379 -0.008048 \n", + "ได้ -0.078255 0.034323 0.459774 -0.748643 0.337775 -0.487408 -0.511535 \n", + "\"\"\"\" -0.090323 -0.058895 0.151262 -0.420358 0.055971 -0.930814 0.163908 \n", + "การ 0.045158 0.066647 0.190095 -0.304333 -0.724927 -0.995488 -0.716609 \n", + "( 0.519889 0.130475 0.125772 0.101328 -0.382658 -1.205359 0.340139 \n", + ") 0.516092 0.201697 0.249139 0.252957 1.138815 -0.018209 0.232265 \n", + "\n", + "[10 rows x 300 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#create dataframe\n", + "thai2dict = {}\n", + "for word in model.index2word:\n", + " thai2dict[word] = model[word]\n", + "thai2vec = pd.DataFrame.from_dict(thai2dict,orient='index')\n", + "thai2vec.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using t-SNE, we can compress the 300 dimensions of each word into a 2D plane and plot their relationships." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-26T02:35:36.291285Z", + "start_time": "2018-01-26T02:35:36.264520Z" + } + }, + "outputs": [], + "source": [ + "labels = model.index2word\n", + "\n", + "# #tnse\n", + "# tsne = TSNE(n_components=2, init='pca', n_iter=1000)\n", + "# thai2plot = tsne.fit_transform(thai2vec)\n", + "# pickle.dump(thai2plot,open(f'{model_path}thai2plot.pkl','wb'))\n", + "\n", + "thai2plot = pickle.load(open(f'{model_path}thai2plot.pkl','rb'))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "labels[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-26T02:35:39.610920Z", + "start_time": "2018-01-26T02:35:38.469383Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#stolen from https://blog.manash.me/how-to-use-pre-trained-word-vectors-from-facebooks-fasttext-a71e6d55f27\n", + "def plot_with_labels(low_dim_embs, labels, filename=None, figsize=(10,10),\n", + " axis_lims = None):\n", + " assert low_dim_embs.shape[0] >= len(labels), \"More labels than embeddings\"\n", + " plt.figure(figsize=figsize) # in inches\n", + " for i, label in enumerate(labels):\n", + " x, y = low_dim_embs[i, :]\n", + " plt.scatter(x, y)\n", + " prop = fm.FontProperties(fname=f'THSarabunNew.ttf',size=20)\n", + " plt.annotate(label,\n", + " fontproperties=prop,\n", + " xy=(x, y),\n", + " xytext=(5, 2),\n", + " textcoords='offset points',\n", + " ha='right',\n", + " va='bottom')\n", + " if axis_lims is not None: plt.axis(axis_lims)\n", + " if filename: plt.savefig(filename)\n", + " \n", + "plot_with_labels(thai2plot[200:500],labels[200:500],axis_lims = [0,30,0,30])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Word Arithmetic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can do simple \"arithmetic\" with words based on the word vectors such as:\n", + "* ผู้หญิง + พระราชา - ผู้ชาย = พระราชินี\n", + "* นายกรัฐมนตรี - อำนาจ = ประธานาธิบดี\n", + "* กิ้งก่า + โบราณ = ไดโนเสาร์" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-25T08:48:16.434798Z", + "start_time": "2018-01-25T08:48:16.312043Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('พระราชินี', 0.7954867482185364),\n", + " ('กษัตริย์', 0.7382755279541016),\n", + " ('พระเจ้า', 0.7046602368354797),\n", + " ('เจ้าชาย', 0.6979373097419739),\n", + " ('พระมหากษัตริย์', 0.6972416639328003),\n", + " ('เจ้าฟ้าหญิง', 0.6871017217636108),\n", + " ('พระเจ้าแผ่นดิน', 0.6827988624572754),\n", + " ('พระพุทธเจ้า', 0.671796977519989),\n", + " 
('มกุฎราชกุมาร', 0.6711805462837219),\n", + " ('นายพล', 0.6694187521934509)]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#word arithmetic\n", + "model.most_similar_cosmul(positive=['พระราชา','ผู้หญิง'], negative=['ผู้ชาย'])" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-25T08:48:18.493511Z", + "start_time": "2018-01-25T08:48:18.104736Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlMAAAJCCAYAAADky0LWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3XuUnXV97/HPLxdMuCjFBAFRqbfgKUakowRILAGxQQviElGLHlxis8B1rLJcoGCB1h48UGwRBLlUEQUKplEqF0nDtSKXSBAb5ZgiokgknkQuQaIhmeR3/phhTNIkk+Q3M3tCXq+1WGQ/+5m9v/NjEt48z7MfSq01AABsnhGdHgAAYEsmpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaDBqKN9s3LhxdY899hjKtwQA2Cz33Xffb2qt4/vbb0hjao899sjcuXOH8i0BADZLKeWRjdnPaT4AgAZiCgCggZgCAGggpgCAAfXss8/miCOOyN/93d8lSW677bbMmzevw1MNniG9AB0AeP569tln8973vjd77bVXli1bllmzZuXQQw/N5z//+cyYMaPT4w0aR6YAgAFRa82yZcty//33Z+XKlXn00Ufz1a9+Needd1622267To83aByZAgAGxMiRI5Mk48aNyw477JCLLroor3rVqzo81eATUwDAgBg9enRuuOGGvqjaWjjNBwAMmNaQuuqqq3LYYYflwAMPzHvf+9488MADAzTZ4BFTAMCwMGfOnJx00klZsmRJuru7s3Tp0rzvfe/Lb37zm06PtkFO8wEAw8K+++6bRx99NEnS3d2dn/3sZ5k9e3YefPDBjBs3rsPTrZ8jUwDAgHjmmWcyefLkTJkyJQsWLEiS3H777TnzzDPzjW98I1OnTk1XV1dOP/30JMnFF1+cyy+/PEkyY8aMHHDAAZkyZUo+9KEPJUkmTJiQww8/PJ/5zGcyZcqUzJ8/P0kybdq0of/mNsCRKQCgyTPPPJNp06bl6aefTldXV4466qhceOGF2WGHHXL77bdn3rx5efrppzN58uRcdtllmThxYq699trsvvvumTFjRu6888589KMfzV577ZXu7u6sWrUqM2bMyF/+5V/mlFNOycqVKzN27NiMHTs2Dz/8cO6+++4ceOCB+c53vpNtt92209++I1MAQJv7778/++yzTz7wgQ/kuuuuy/nnn5/99tsvK1asyFNPPZX3v//92WWXXXLiiSfmX//1X/Pyl7887373u7Nw4cKMHDkyp556am666abcfvvtmTVrVubOnZuHHnooSc9tFnbZZZfMnj07r3jFK3Lrrbdmjz32yOTJk4fNxeliCgBosnLlylx77bW55ppr8tGPfjTXX399tt9++4wePTqvfvWr841vfCNjx47N2WefnTFjxuTnP/95Lr/88rzqVa/Kf/3
Xf2X//ffPWWedlRUrVmT77bfPxIkT+y46nzRpUubOnZtJkyblK1/5Sg4//PD86le/yuWXX56VK1d2+Dvv4TQfANDsuOOOy6RJk3LPPfdk+fLl+exnP5tJkyblIx/5SF760pdmyZIlueiiizJhwoRcc801OeusszJ27NiMHz8+y5cvz4knnpjRo0cnScaMGZPu7u4kyQMPPJDXv/71mTlzZh555JHsvPPO6erqyqRJkzr57a7BkSkAYECsWLEiV111VT75yU9m3333zQ033JBaa9/zc+fOzTbbbJPXv/71GTXqD8dzfvSjH+VP//RP1/majz/+eHbbbbesXLkyf/M3f9N3EfpwIqYAgAFx2WWXZeTIkZk9e3be9ra3Zc8998wFF1yQJCml5Oyzz86+++7bdwrviSeeyLbbbpvly5dnv/32y8UXX/zfXnPHHXfMz3/+84wZMyb//M//nBNPPDELFy4c0u+rP2IKAGh2991359lnn83SpUvzqU99KhdccEHe+MY39j1/11135ZBDDslxxx2XY489NnPmzMkee+yRF77whRk1alRuvvnmXHfddUmSsWPHZunSpUmSnXfeOaNGjcoBBxyQww8/PIsWLco222zTdz+q4cA1UwBAs/nz5+f+++/vu1XBd77znSTJzJkz84tf/CKzZ8/Ohz/84YwYMSJz5szJtGnTct555yVJ3vWud2Xy5Mn58z//8yTJKaeckqOPPjpTp07NC17wgsycOTPbb79933s99thjmTJlSv7qr/5qiL/LdSurn8scbF1dXXXu3LlD9n4AwNC69NJLc8UVV+TMM8/MbrvtlgsvvDBnnHHGJr3G5z73udx666254oorsmzZsnz961/PaaedNkgTr18p5b5aa1d/+znNBwA0u/TSS3PQQQfl4YcfzuOPP55Pf/rT6e7uzuOPP77JrzV//vzst99+WbhwYWbOnJnXvOY1gzDxwBFTAECzWmt++9vfZocddsipp56aJUuW5He/+90an9rbWNOnT8+sWbNywgkn5Gc/+1mOOuqoQZh44LhmCgBoduyxx2bPPffM7Nmz88pXvjJLlizJxz72sU0+xZckkydPzr333tv3+IaHb8j/ueefsmT5oqxasWO2XXpYPvNnR+eIN750IL+FzSamAIABsc8+++Tcc8/N9773vVxxxRUDcmPNGx6+Iad+7/SsqM8mJRmxzVP5/airc8rs7iTHDIugElMAwIAYO3ZsZsyYMaCvee4Pzu0JqdWUEStSdroxZ//7pGERU66ZAgCGrV8v/fU6t5fRT+Wxp34/xNOsm5gCAIatXbbbZZ3b64ods9uOY4d4mnUTUwDAsPXxfT6e0eUFa2yrq0anPnFoTvzzCR2aak2umQIAhq13vPIdSfLfP833Np/mAwDYKO945Tv6omo4cpoPAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoEG/MVVKubSUsqiU8uO1tn+slDK/lPJAKeUfBm9EAIDha2OOTF2WZNrqG0opU5O8M8kbaq1/kuTzAz8aAMDw129M1Vq/m+SJtTYfn+TMWuuzvfssGoTZAACGvc29Zuq1SaaUUuaUUv6jlPKmgRwKAGBLMarh63ZKMinJm5LMKKW8stZa196xlDI9yfQkefnLX765cwIADEube2RqQZJv1R7fT7Iqybh17VhrvaTW2lVr7Ro/fvzmzgkAMCxtbkz9W5KpSVJKeW2SbZL8ZqCGAgDYUvR7mq+UclWSA5OMK6U
sSHJ6kkuTXNp7u4TlSY5Z1yk+AIDnu35jqtb6/vU89YEBngUAYIvjDugAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQoN+YKqVcWkpZVEr58Tqe+2QppZZSxg3OeAAAw9vGHJm6LMm0tTeWUl6W5G1JfjnAMwEAbDH6jala63eTPLGOp85JclKSOtBDAQBsKTbrmqlSyjuT/KrW+p8bse/0UsrcUsrcxYsXb87bAQAMW5scU6WUbZOckuS0jdm/1npJrbWr1to1fvz4TX07AIBhbXOOTL0qyR8n+c9Syi+S7J7kB6WUXQZyMACALcGoTf2CWuuPkuz83OPeoOqqtf5mAOcCANgibMytEa5KcneSCaWUBaWUYwd/LACALUO/R6Zqre/v5/k9BmwaAIAtjDugAwA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAAN+o2pUsqlpZRFpZQfr7bt7FLK/FLKvFLKNaWUHQd3TACA4WljjkxdlmTaWttuSrJXrXVikgeTnDzAcwEAbBH6jala63eTPLHWttm11u7eh/ck2X0QZgMAGPYG4pqpDye5cX1PllKml1LmllLmLl68eADeDgBg+GiKqVLKZ5J0J7lyffvUWi+ptXbVWrvGjx/f8nYAAMPOqM39wlLKh5L8RZKDa611wCYCANiCbFZMlVKmJTkpyZ/VWn83sCMBAGw5NubWCFcluTvJhFLKglLKsUnOT7JDkptKKT8spVw0yHMCAAxL/R6ZqrW+fx2bvzIIswAAbHHcAR0AoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIG
YAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaNBvTJVSLi2lLCql/Hi1bTuVUm4qpfy09+9/NLhjAgAMTxtzZOqyJNPW2vbpJLfUWl+T5JbexwAAW51+Y6rW+t0kT6y1+Z1Jvtb7668lOWKA5wIA2CJs7jVTL6m1Luz99a+TvGR9O5ZSppdS5pZS5i5evHgz3w4AYHhqvgC91lqT1A08f0mttavW2jV+/PjWtwMAGFY2N6b+Xyll1yTp/fuigRsJAGDLsbkxdW2SY3p/fUySbw/MOAAAW5aNuTXCVUnuTjKhlLKglHJskjOTHFJK+WmSt/Y+BgDY6ozqb4da6/vX89TBAzwLAMAWxx3QAQAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGYgoAoIGYAgBoIKYAABqIKQCABmIKAKCBmAIAaCCmAAAaiCkAgAZiCgCggZgCAGggpgAAGogpAIAGTTFVSjmhlPJAKeXHpZSrSiljBmowAIAtwWbHVCnlpUn+OklXrXWvJCOTvG+gBgMA2BK0nuYblWRsKWVUkm2TPNY+EgDAlmOzY6rW+qskn0/yyyQLkyyptc4eqMEAALYELaf5/ijJO5P8cZLdkmxXSvnAOvabXkqZW0qZu3jx4s2fFABgGGo5zffWJD+vtS6uta5I8q0k+6+9U631klprV621a/z48Q1vBwAw/LTE1C+TTCqlbFtKKUkOTvKTgRkLAGDL0HLN1JwkM5P8IMmPel/rkgGaCwBgizCq5YtrracnOX2AZgEA2OK4AzoAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3E1CZYtmxZlixZ0ukxAIBhRExtgnvuuScXXnhhp8cAAIaRUZ0eoFOWLl2a0047LfPmzUt3d3cmT56cU045JWPHju3bZ8aMGbnyyivz9NNPZ9y4cXnLW97SwYkBgOFoqz0ydfLJJ+e2227LM888kyS566678olPfKLv+XvvvTef/OQn8+STT2bFihX5/e9/nzPOOKNT4wIAw9RWe2TqvPPO6/v10qVLc++99+bWW2/NihUrMnr06LzpTW/Ko48+miRZuXJlfvGLX+Tmm2/OS17ykk6NDAAMQ6XWOmRv1tXVVefOnbtJX9Pd3Z1Rozat+bq7uzNy5MiUUjbp6wAAnlN
Kua/W2tXffkN6mu+pp57KPvvsk49//ONJknPPPTennHJKkmTatGlZvHhxpk6dmgMPPDAHHnhgDj744Hz7299Okpx99tl585vfnOdi7Nprr83FF1+cJPnmN7+Zfffdt+9r3/Oe9+TXv/5133tMnjw5++23Xz71qU/1zfKFL3whBx10UPbZZ5++1zn55JNzxx135Pvf/366urry7ne/e435r7766lx22WWDt0AAwBZnSGPqySefzKxZszJ//vwkPdclPfTQQ33P33nnnVm1alUOOeSQnH/++bnlllv6guaXv/xlvvSlL+WKK65IkvzgBz/I6173uiTJzJkz87rXvS7HHHNMrrnmmlxzzTXZdddd88Mf/jB33XVX7rjjjtx999154oknctddd+WRRx7J6aefnhe/+MX5j//4j3z1q1/NfffdlwULFmTy5Mm59tpr86UvfSljx47NokWLhnKJAIAtzJDG1Pjx4/Oud70r8+bNy4oVKzJq1KjssssueeKJJ5IkRxxxRG677bYcdthhmTlzZo4++ug8dxryqaeeykknnZQ3vvGNmTdvXu68886+T9ddddVVueCCC/KSl7wkxx9/fP7lX/4lSfKyl70s559/ft/pvgMOOCCPPvpoHn/88Rx33HGZNm1afvrTn2bPPffMk08+mVJKSin54Ac/mJNOOik333xzVq1aNZRLBABsYYY0prbffvvceeedecMb3pC77747BxxwQA455JDccsstfxhoxIhMnDgxf/u3f5sRI0Zk8eLFSZJ//Md/zO67754vf/nL+cxnPpNLLrlkjdfebrvtcuihh+bKK6/sO3r14he/OOPHj19jv1pr9t577zzyyCOZNWtWbr/99hxzzDF561vf2ndqcMKECbn99tszbdq0wVwOAOB5oGOf5rvpppsyffr07LTTTvn0pz+9zn1GjhzZd2Ro5513zte//vV+X3f1r1mfESNG5Oqrr06SvOMd78gJJ5zQt31D5s+fn4kTJ/Y7AwCw9ejIfaZGjBiRxx57LC972cuy3XbbZdmyZWt88u6hhx7K1KlTc+ONN/Ztu/DCC3PkkUdmzpw5SZJbb7013/rWt9Z43Ysuuij777//Js2ycuXKfj/1t3z58nzxi1/MWWedlde+9rWb9PoAwPNbR2Jq5MiRGTduXN/jCRMmrPH/vLviiityzjnnZMqUKRk5cmSSnptqTps2Lddff32WLVuWL37xi9lrr73WeN1Zs2blxhtvzDbbbLPRs4wePTpPP/10kp7/997aZsyYkbe97W25+OKL8/DDD+fv//7vN+l7BQCe3zoSU7/97W9z0EEH9T2eNm1a30XoSXLYYYdl+vTp6e7u7rvm6eijj855552Xm2++OYccckg++MEP/rejRHvvvXemTp2aI488cqNnOf744/PWt741+++/f97+9rf/t+dnzZqVWbNm5Zhjjsnhhx+erq5+bzcBAGxFBuWmnaWUUbXW7rW3b+imnW60CQAMJwN2085SyuGllB+UUs7tffzxUsrnen89q5QyvpRyWynl9t6/bknyzt7nTyylfL+U0pX03N5gsG60uaGvAwAYLBtzmu+oJNOS7Nn7+E1JXr3a8wf0vs5NSf5XrfXgWus3e597eZKPJvlAkvzud78btBttru/rWiy57rr89KCD85PX/Y/89KCDs+S665peDwB4/tmYmLowyTVJJpZSRifpTvLrUspOSVJr/bckU5Ncl+TIUsqV5Q/n6nZM8g9J7i+lTHzmmWcG7Uab6/u6zbXkuuuy8NTT0v3YY0mt6X7ssSw89TRBBQCsod+YqrXeWWs9IMl/JtkvyZ3pOQp18Gr7rKq1zqu1/m2SVUmeu1PmJ5MsSPKRJGe84hWvWOO1B/JGm+v7us216JwvpK716b66bFkWnfOFzX5NAOD5Z1Nv2nlIkkuSPJHkzPXsszK9kVZrXZTkfz73RFdX1zrrZjBvtLm5uhcu3KTtAMDWaVNKZFWS3Wqtj9ZalyY
Zk6Qvjkopry6l3Jbk0NW2HV9KmVlK2TdJnn766SG70WarUbvuuknbAYCt06bE1Mokv1nt8X8ledFqjz+Q5IQkd/TumyT7J5mV5C9KKWMWL148JDfaHAg7n/CJlDFj1thWxozJzid8YlDeDwDYMm3Kab4dkty62uNZSaaXUmYm2SvJC9Lzyb8Ha62Le/e5Mj0XoC9NcuBOO+203httfuITGx8pz91oc9SoUTniiCM24VvYeC867LAkPddOdS9cmFG77pqdT/hE33YAgKTxpp2llK8luaPW+uVSyjZJtq21PrW+/Td0004AgOFkY2/auakXoK/+Bi9K8pYkH0qSWuvyJMs39/WazZuR3PLZZMmC5EW7Jweflkw8qmPjAABbh5aPwv1xksVJvlpKub+U8uVSynYDNNemmTcjue6vkyWPJqk9f7/ur3u2AwAMopaYGpVknyQX1lrfmJ7roj699k6llOmllLmllLmLFy9e++mBcctnkxW/X3Pbit/3bAcAGEQtMbUgyYJa65zexzPTE1drqLVeUmvtqrV2rX1TzQGzZMGmbQcAGCCbHVO11l8nebSUMqF308FJ/u+ATLWpXrT7pm0HABggrbcP/1iSK0sp85LsneRz7SNthoNPS0aPXXPb6LE92wEABtFmf5ovSWqtP0zS70cGB91zn9rzaT4AYIg1xdSwMvEo8QQADLnB+b8EAwBsJcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADQQUwAADcQUAEADMQUA0EBMAQA0EFMAAA3EFABAAzEFANBATAEANCi11qF7s1IWJ3lkyN5w+BuX5DedHmILYJ36Z436Z402jnXqnzXaOM+HdXpFrXV8fzsNaUyxplLK3FprV6fnGO6sU/+sUf+s0caxTv2zRhtna1onp/kAABqIKQCABmKqsy7p9ABbCOvUP2vUP2u0caxT/6zRxtlq1sk1UwAADRyZAgBoIKaGSCllTCnl+6WU/yylPFBK+bu1nj+vlPJMp+YbDta3RqWUy0opPy+l/LD3r707PWsnbWCdSinljFLKg6WUn5RS/rrTs3bKBtbojtV+jh4rpfxbp2ftlA2s0cGllB/0rtH3Simv7vSsnbSBdTqod51+XEr5WillVKdn7bRSyshSyv2llOt7H/9xKWVOKeWhUso3SinbdHrGwbLV/8MfQs8mOajW+kwpZXSS75VSbqy13lNK6UryRx2ebzhY5xr1PndirXVmB2cbTta3Tq9L8rIke9ZaV5VSdu7olJ21vt9vU57boZTyzSTf7tiEnbe+n6MLk7yz1vqTUspHk/xNkg91cM5OW9c6/XuSryU5uNb6YCnls0mOSfKVTg46DHw8yU+SvLD38VlJzqm1Xl1KuSjJsen5+XrecWRqiNQezx15Gt37Vy2ljExydpKTOjbcMLG+NergSMPSBtbp+CSfrbWu6t1vUYdG7Lj+fpZKKS9MclCSrfbI1AbWqOYP/zJ8UZLHOjDesLGedVqZZHmt9cHe7TcleXcn5hsuSim7J3lHki/3Pi7p+T323H8Efy3JEZ2ZbvCJqSHUewj0h0kWJbmp1jonyf9Kcm2tdWFnpxse1rNGSXJGKWVeKeWcUsoLOjjisLCedXraeSuiAAACtUlEQVRVkveWUuaWUm4spbyms1N21gZ+lpKeP9RvqbU+3Znphof1rNFHknynlLIgyQeTnNnJGYeDtdcpyfeTjOo9q5AkR6bnqPDW7AvpOSiwqvfxi5M8VWvt7n28IMlLOzHYUBBTQ6jWurLWuneS3ZO8uZTyliTvSfLFzk42fKxjjfZKcnKSPZO8KclOST7VwRGHhfWs0wuSLOu94/A/J7m0kzN22nrW6DnvT3JVZyYbPtazRickeXutdfckX03yT52ccThYe52S/EmS9yU5p5Ty/SS/Tc/Rqq1SKeUvkiyqtd7X6Vk6RUx1QK31qSS3JZma5NVJHiql/CL
JtqWUhzo523Cx2hpNq7Uu7D3U/mx6/nB/c2enGz5WX6f0/Jfft3qfuibJxE7NNZystUYppYxLz8/QDZ2cazhZbY0OTfKG1Y7ifSPJ/h0bbJhZ68+lu2utU2qtb07y3SQPbvirn9cOSHJ477/Hrk7P6b1zk+y42oX5uyf5VWfGG3xiaoiUUsaXUnbs/fXYJIckua/WukutdY9a6x5Jfldr3Wo/ObOeNZpfStm1d1tJz+mZH3duys5b3zql5/qfqb27/Vm24j/cN7BGSc8pmetrrcs6Nd9wsJ41+kmSF5VSXtu723Pbtlob+HNp595tL0jP0fKLOjdlZ9VaT6617t7777H3Jbm11np0esLzyN7djsnz+AMfPs03dHZN8rXeC85HJJlRa72+wzMNN+tco1LKraWU8UlKkh8mOa6TQw4D61un7yW5spRyQpJn0nPty9ZqQ7/f3hfXASXr/zn6qyTfLKWsSvJkkg93cshhYH3rdHbv6a0RSS6std7a0SmHp08lubqU8r+T3J/n8acd3QEdAKCB03wAAA3EFABAAzEFANBATAEANBBTAAANxBQAQAMxBQDQQEwBADT4//H2HoL60kUdAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sample_words = ['ผู้หญิง','พระราชา','ผู้ชาย','พระราชินี']\n", + "sample_idx = []\n", + "for word in sample_words:\n", + " sample_idx.append(labels.index(word))\n", + "sample_plot = thai2plot[sample_idx]\n", + "plot_with_labels(sample_plot,sample_words)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-25T08:48:50.851735Z", + "start_time": "2018-01-25T08:48:50.731913Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('รองนายกรัฐมนตรี', 1.4945054054260254),\n", + " ('รัฐมนตรี', 1.400755763053894),\n", + " ('ประธานาธิบดี', 1.3626699447631836),\n", + " ('พันเอก', 1.3437265157699585),\n", + " ('ผู้บัญชาการทหารบก', 1.3405414819717407),\n", + " ('ผู้กำกับภาพยนตร์', 1.3339321613311768),\n", + " ('นักฟุตบอล', 1.331659197807312),\n", + " ('เอกอัครราชทูต', 1.3306005001068115),\n", + " ('แห้ง', 1.3243674039840698),\n", + " ('สุภาพสตรี', 1.3231494426727295)]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.most_similar_cosmul(positive=['นายกรัฐมนตรี'],negative=['อำนาจ'])" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-25T08:48:51.481402Z", + "start_time": "2018-01-25T08:48:51.158689Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sample_words = ['นายกรัฐมนตรี','อำนาจ','รองนายกรัฐมนตรี']\n", + "sample_idx = []\n", + "for word in sample_words:\n", + " sample_idx.append(labels.index(word))\n", + "sample_plot = thai2plot[sample_idx]\n", + "plot_with_labels(sample_plot,sample_words)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-25T08:49:11.465639Z", + "start_time": "2018-01-25T08:49:11.336707Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('สิ่งมีชีวิต', 0.537461519241333),\n", + " ('สัตว์เลี้ยงลูกด้วยนม', 0.5080005526542664),\n", + " ('แมลง', 0.5048903226852417),\n", + " ('ผลไม้', 0.4839756190776825),\n", + " ('มนุษย์', 0.47641509771347046),\n", + " ('ผัก', 0.46431201696395874),\n", + " ('สัตว์น้ำ', 0.45941096544265747),\n", + " ('ต้นไม้', 0.45185261964797974),\n", + " ('พันธุ์ไม้', 0.4504697620868683),\n", + " ('ไม้ยืนต้น', 0.44425833225250244)]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#word arithmetic\n", + "model.most_similar_cosmul(positive=['สัตว์','พืช'], negative=[])" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-25T08:49:12.202620Z", + "start_time": "2018-01-25T08:49:11.835817Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sample_words = ['สัตว์','พืช','สิ่งมีชีวิต']\n", + "sample_idx = []\n", + "for word in sample_words:\n", + " sample_idx.append(labels.index(word))\n", + "sample_plot = thai2plot[sample_idx]\n", + "plot_with_labels(sample_plot,sample_words)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "## Doesn't Match" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true + }, + "source": [ + "It can also be used to do word groupings. For instance:\n", + "* อาหารเช้า อาหารสัตว์ อาหารเย็น อาหารกลางวัน - อาหารสัตว์ is a type of food whereas the others are meals of the day\n", + "* ลาก ดึง ดูด ดัน - ดัน is pushing while the rest are pulling.\n", + "* กด กัด กิน เคี้ยว - กด is not a verb for the eating process\n", + "Note that this could be relying on a different \"take\" than you would expect. For example, you could have answered ลูกเขย in the second example because it is the one associated with male gender." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-26T02:35:24.619900Z", + "start_time": "2018-01-26T02:35:24.505717Z" + }, + "hidden": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/gensim/models/keyedvectors.py:858: FutureWarning: arrays to stack must be passed as a \"sequence\" type such as list or tuple. 
Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.\n", + " vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)\n" + ] + }, + { + "data": { + "text/plain": [ + "'อาหารสัตว์'" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.doesnt_match(\"อาหารเช้า อาหารสัตว์ อาหารเย็น อาหารกลางวัน\".split())" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-26T02:35:46.163075Z", + "start_time": "2018-01-26T02:35:45.838017Z" + }, + "hidden": true + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlwAAAJCCAYAAAAVwBlbAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3XuUnXV99/3PLwfAsioIogGR03NTQFkD4lQFEsgBIRoCiIVqUwF9QjhYIIqiLFpNxVaoBwgiFnqL+hRr5EYRATkKqchBGoQAalBOIhBkQIk3h5DT7/kjwwgaSIbxlz3JvF5rzcrsa1+H71yOy7fXvvaeUmsNAADtDOv0AAAAazvBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoLERnR7g+V796lfXrbbaqtNjAACs1C233PJYrXWTVVl3UAXXVlttlTlz5nR6DACAlSql/GpV1/WSIgBAY4ILAKAxwQUArNXuuuuufP/73+/oDIILAFgr/exnP8sRRxyRK6+8MuPHj+/oLIILAFgrnXnmmUmS0aNHZ7311uvoLIILAFgr7bDDDrnrrruyySar9MkNTQkuAGBQuvPOO7PXXntl6tSpfcsOP/zwLF68eJW2P+aYYzJ79uxsvvnmrUZcZYILABiUfvjDH+aNb3xj7rvvvixatCj/+Z//mSVLlmTkyJGdHq3fBBcAMChNnTo1G264YZYtW5b99tsvDz/8cP793/+902O9LIPqk+YBgKFryZIlGTHiD2myzjrr5J//+Z+TJDfccEM23XTTrLvuukmSRx55JKNGjcrChQvz9NNPZ6ONNurIzKtKcAEASZKZM2fmoosuSpI888wz2W+//XLiiSfmiSeeyOGHH56enp4sWrQon/70pzN+/PjcfPPNOfroo7PNNtvk/PPPT5Kce+65OfPMM/O3f/u3+djHPpYHHnggRx55ZJ5++unUWnPGGWdkp512Sk9PTw4++ODUWpMkw4cPz9FHH513v/vdueuuu3LSSSfld7/7XUaMGJEtt9wyPT09+cY3vpEkOfXUU3PNNddk7Nixue222/LQQw/lS1/6UnbaaacsWLCgb9Zaa3beeed8/vOfz/DhwztzUp9Tax00X29+85srALD6XX311XXmzJl9j5cuXVqPP/74+tWvfrVOnz69XnLJJbXWWh9//PE6evToWmutJ510Uv3xj39cp0yZUh999NFaa6377rtvXbhwYT3wwA
NrrbW++93vrnPnzq211nrPPffUSZMm1VprvfDCC+see+xRP/3pT9c77rjjBbN88IMfrPPnz6+11vrAAw/Uo48+uk6ePLnWWuuCBQv6vj/vvPPqvHnz6v33318POuigWmutRx11VL366qv79vWlL32pnnXWWX/GM/UHSebUVWwc93ABAJkwYUK23XbbvO1tb8vZZ5+dYcOGZdq0abn55ptzxBFHZNKkSUmSjTbaKOuvv36S5H3ve19OOOGEXH311Vm6dGmSZLvttsuoUaMybty4JMnJJ5+crq6uJMk222yTJUuWJEkOOOCAXHvttZk8eXIuuOCCTJkype9q15lnnpnrrrsuEyZMyO6775711lsvv//975Mk8+bNy0477ZRHH3005557br785S9nyy23zOOPP54k+cIXvpAJEyb0/Vz77LNP7rjjjtanb6UEFwCQJDnjjDNy2WWX5Yorrkiy/B6qZcuWZfvtt1/h+tttt11mz56diRMnJlkeQz09PZk1a1aeeeaZJMs/C+vFDBs2LF1dXZkxY0aGDRuWnp6eJMmNN96YSy65JJdeemkeeOCB7LXXXvnxj3+cJHnVq16VBQsW5Prrr8+hhx6axYsXp9ba987FP/6A0+HDh2fZsmUDOCt/HoILAEiSLFq0KJMnT+67OtVfJ598ck499dRssMEGeeqpp/q17fPDaPbs2Zk6dWpfPL3jHe/I2972tiTJtttum1//+tfZe++9853vfCfDhg1LKeVlzbs6CS4AIEkycuTI/OhHP8oxxxzT721nz56drq6ujBo1ql/b3X333Rk3blwuu+yyvmXPPvtsjjnmmBx22GF9LzM+9+7EZPlVrLvuuisbbbRRPve5z73ovv/xH/8xf/M3f9PPn6QNwQUAJEk22GCD/OQnP+l7fOutt2bjjTdepW2/9a1vZfr06f0+5nnnnZfTTjstY8aM6Xsn4YgRI/KFL3whI0aMyD333LPCOU866aScddZZLwix51u8eHF++tOf5mtf+1rn36EYHwsBAPT61Kc+laOPPjpLly5NrTV/+Zd/ma985SurtO0hhxzyovHzUiZPnpxp06Zls8026/ubh/vvv3+mTp2aV7/61dlqq61WuN3HP/7xl/yD1CNHjsyrXvWqHHrooTn11FP7PdefW3nuUt1g0N3dXefMmdPpMQAAVqqUckuttXtV1vWSIgBAY4ILAFh73H5+ctqOyYwNl/97+/mdniiJe7gAgLXF7ecnFx+bLF7+GWBZ8Ovlj5Ok6+DOzRVXuACAtcUPPvWH2HrO4meWL++w5le4Sin3J/m/SZYmWbKqN5cBAPTLggf7t3w1Wl0vKY6rtT62mo4FAAxFG2y+/GXEFS3vMC8pAgBrhwmfSEa+4oXLRr5i+fIOWx3BVZNcWUq5pZQybTUcDwAYiroOTiafkWzw+iRl+b+Tz+j4DfPJ6nlJcXSt9aFSymuSXFVKmVdr/eFzT/ZG2LQk2WKLLVbDOADAWqvr4EERWH+s+RWuWutDvf8+muTCJG/5o+fPqbV211q7n/tIfwCAtUnT4CqlrF9K+cvnvk+yd5I7Wx4TAGCwaf2S4muTXFhKee5Y/1VrvbzxMQEABpWmwVVrvTfJTi2PAQAw2PlYCACAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxg
QXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQWPPgKqVMLKXcVUq5u5Ty8dbHAwAYbJoGVylleJIvJXlHkjckeW8p5Q0tjwkAMNi0vsL1liR311rvrbUuSjIryf6NjwkAMKi0Dq7XJfn18x4/2LsMAGDI6PhN86WUaaWUOaWUOT09PZ0eBwDgz651cD2U5PXPe7x577I+tdZzaq3dtdbuTTbZpPE4AACrX+vg+p8k25ZSti6lrJPkPUm+1/iYAACDyoiWO6+1Liml/EOSK5IMT3JurfWnLY8JADDYNA2uJKm1fj/J91sfBwBgsOr4TfMAAGs7wQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADTWLLhKKTNKKQ+VUm7r/Xpnq2MBAAxmIxrv/7Ra6+caHwMAYFDzkiIAQGOtg+sfSim3l1LOLaW8qvGxAAAGpQEFVynl6lLKnSv42j/Jl5P8P0l2TjI/yedfZB/TSilzSilzenp6BjIOAMCgVGqt7Q9SylZJLqm17vhS63V3d9c5c+Y0nwcAYKBKKbfUWrtXZd2W71Lc9HkP35XkzlbHAgAYzFq+S/HfSik7J6lJ7k9yRMNjAQAMWs2Cq9b6vlb7BgBYk/hYCACAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQ2ICCq5RyUCnlp6WUZaWU7j967sRSyt2llLtKKfsMbEwAgDXXiAFuf2eSA5Oc/fyFpZQ3JHlPkjcm2SzJ1aWUv6q1Lh3g8QAA1jgDusJVa/15rfWuFTy1f5JZtdZna633Jbk7yVsGciwAgDVVq3u4Xpfk1897/GDvMgCAIWelLymWUq5OMmoFT51Ua71ooAOUUqYlmZYkW2yxxUB3BwAw6Kw0uGqte72M/T6U5PXPe7x577IV7f+cJOckSXd3d30ZxwIAGNRavaT4vSTvKaWsW0rZOsm2SW5udCwAgEFtoB8L8a5SyoNJdk1yaSnliiSptf40yflJfpbk8iQf9A5FAGCoGtDHQt
RaL0xy4Ys89y9J/mUg+wcAWBv4pHkAgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcNDNr1qyMGTMmZ511VpJk2bJl+cAHPtDhqQBg9RNcNHPVVVdl7Nixue222/Lkk09m+vTpGT16dKfHAoDVTnDRzMknn5zf/OY3+cUvfpEpU6ZkzJgxrnABMCSN6PQAtLVkyZKMGNGZ/5g322yznHPOOR05NgAMJq5wNTZz5syMHz8+48ePz6677prPfOYzSZInnngiBx10UMaOHZvddtst11xzTZLk5ptvTnd3dw4++OC+fZx77rnZZZddcuqppyZJHnjggbzzne/M2LFjs+eee2bu3LlJkp6enowbNy5jx47N2LFjM2HChFx00UV9+znvvPMybty47Lnnnpk4cWLmzZvX99xVV12VvffeO+PHj8+ee+6Z6dOnZ9GiRUmSxYsX5wMf+EDGjh2bt771rZk1a9YLfsbvfve7edOb3pQjjzwySbJo0aIcccQR2XPPPTN+/PgceuihefLJJ5MkX/7yl7PLLrvkX//1X1+wjyOPPDL333//gM83AAxKtdZB8/XmN7+5rk2uvvrqOnPmzL7HS5curccff3z96le/WqdPn14vueSSWmutjz/+eB09enSttdaTTjqp/vjHP65Tpkypjz76aK211n333bcuXLiwHnjggbXWWt/97nfXuXPn1lprveeee+qkSZNqrbVeeOGFdY899qif/vSn6x133PGCWb7//e/Xf/zHf6xLly6ttdY6d+7c+ta3vrXWWuv8+fPrYYcdVp988sm+9b/zne/UE088sdZa6+mnn17PPPPMWmutzzzzTH3LW95SFy9e3LfulClT6vz58+vb3/72Wmutp556av3KV77S9/yll15aTzjhhFprrRMnTqyLFi2qe++99wvmO+KII+p9993Xj7MLAJ2VZE5dxcZxhauhCRMmZNttt83b3va2nH322Rk2bFimTZuWm2++OUcccUQmTZqUJNloo42y/vrrJ0ne97735YQTTsjVV1+dpUuXJkm22267jBo1KuPGjUuy/N6orq6uJMk222yTJUuWJEkOOOCAXHvttZk8eXIuuOCCTJkyJct/H5ZfFfvNb36TG264IUnS1dWVrbbaKo899lhGjRqV//iP/8hnPvOZjBs3Lvvuu28WLFiQ2267LUmy3377ZerUqUmS9dZbLzvssEMef/zxvp/zqKOOyoEHHpjbb789SXLccce94F6tffbZJ3fccUeS5PDDD88ee+yRRx999M99ugFg0BJcjZ1xxhm57LLLcsUVVyRJ1llnnSxbtizbb7/9CtffbrvtMnv27EycODFJMm/evPT09GTWrFl55plnkiQ77LDDix5v2LBh6erqyowZMzJs2LD09PQkSd773vfmoYceyhve8Ia+dTfccMO+l/pOOeWUbL311rn22mtz0UUX5bHHHsvTTz+dJNl6662z7rrrvuA4tdY8/PDDWbBgQXbffffccMMN2XnnnZPkT9YdPnx4li1bliQ58MADc+ONN+a1r33tKpw9AFg7CK7GFi1alMmTJ/ddneqvk08+Oaeeemo22GCDPPXUU/3a9vmhM3fu3Oy6667ZaKONVrjujTfe2HdVavjw4fnQhz6UddZZ5yX3/+yzz2bq1Kl90TYQ8+bNyytf+coB7wcABiPB1djIkSPzox/9KMccc0y/t509e3a6uroyatSofm139913Z9y4cbnsssv6li1duvQl3624cOHC7L777vniF7+YZHl0DRv20r8eW2+9dWbMmJH3v//9Wbhw4QrXOeuss7Lbbru96D4ee+yxHHXUUbn//vuz3nrrveTxAGBNJbga22CDDfKTn/yk7/Gtt96ajTfeeJW2/da3vpXp06f3+5jnnXdeTjvttIwZMybDhw9fpW3WXXfd/OAHP8i3v/3tfh3rjW
98Y0444YQceuihfe9qfL7vf//7ueyyyzJy5MgXLF+6dGlmzJiR9773vXnwwQfz9a9/PTNnzuzXsQFgTSG4GvvUpz6Vj370o30f4fCVr3wlxx577Cpte8ghh/zJ/VCrYvLkyZk2bVqWLFmSTTbZZJW2OfjggzN69OiXvBr1Yv76r/86Rx11VG699dY/eW7nnXfOuHHjXvAxF0nyy1/+MsOGDctVV12V9ddfP8cff3x23XXXfh8bANYE5bl3sQ0G3d3ddc6cOZ0eAwBgpUopt9Rau1dlXVe4AAAaE1ysdgsuvji/HD8hP9/hDfnl+AlZcPHFnR4JAJrytxRZrRZcfHHm/9MnUnvf1bjk4Ycz/58+kSTZYPLkTo4GAM24wsVq9ehpp/fF1nPqwoV59LTTOzQRALQnuFitlsyf36/lALA2EFysViM23bRfywFgbSC4WK1e86HpKX/0ifJlvfXymg/1/wNeAWBN4aZ5Vqvnbox/9LTTs2T+/IzYdNO85kPT3TAPwFpNcLHabTB5ssACYEjxkiIAQGOCCwCgMcEFANCY4AIAaExwAQA0JrgAABoTXAAAjQkuAIDGBBcAQGOCCwCgMcEFANCY4AIAaExwAQA0JrgAABoTXAAAjQkuAIDGBBcAQGOCCwCgMcEFANCY4AIAaExwAQA0JrgAABoTXAAAjQkuAIDGBBcAQGOCCwCgMcEFANCY4AIAaExwAQA0JrgAABoTXAAAjQkuAIDGBBcAQGOCCwCgMcEFANCY4AIAaExwAQA0JrgAABoTXAAAjQ0ouEopB5VSflpKWVZK6X7e8q1KKc+UUm7r/fr3gY8KALBmGjHA7e9McmCSs1fw3D211p0HuH8AgDXegIKr1vrzJCml/HmmAQBYC7W8h2vrUsqtpZT/LqWMaXgcAIBBbaVXuEopVycZtYKnTqq1XvQim81PskWt9fFSypuTfLeU8sZa6+9XsP9pSaYlyRZbbLHqkwMArCFWGly11r36u9Na67NJnu39/pZSyj1J/irJnBWse06Sc5Kku7u79vdYAACDXZOXFEspm5RShvd+v02SbZPc2+JYAACD3UA/FuJdpZQHk+ya5NJSyhW9T+2R5PZSym1JLkhyZK31twMbFQBgzTTQdylemOTCFSz/dpJvD2TfAABrC580DwDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOBaiW9+85vZa6+9ct999yVJzjjjjA5PBACsaQb0p32Gglpr7rvvvjz88MP5n//5nyxatKjTIwEAaxhXuFbiVa96VdZff/1897vfzSOPPJKPfOQjnR4JAFjDuMK1Em9/+9vzjne8o9NjAABrMFe4VmLECE0KAAyM4AIAaExwAQA0NqSDa8mSJZ0eAQAYAjoaXKWU40op1/R+3Th//vwkyRNPPJGDDjooY8eOzW677ZZrrrkmSXLzzTenu7s7Bx98cN8+zj333Oyyyy459dRTkyQPPPBA3vnOd2bs2LHZc889M3fu3CRJT09Pxo0bl7Fjx2bs2LGZMGFCLrrooiTJjBkzctNNN/Xt8/TTT8/48eOzyy675Oyzz06SnHjiibnuuuuSJDNnzszo0aOz66675mMf+1jjswQArOk6dkd4KWVCklprHd/7eNjSpUuXfu1rX8vcuXNz2GGHZdKkSfntb3+b/fffP+PHj8/3vve9nHXWWTnjjDPS09OTTTbZJBdeeGFuvPHG/N3f/V2S5MMf/nBOOeWUdHV15d57782xxx6bSy65JNdff32WLVuWvffeO/vvv3923HHHFc71q1/9Kp/85Cez995756KLLsrb3/72dHd358EHH8zo0aNz22235YYbbsh1112XUkoOP/zw3HDDDdltt91W27kDANYsHbvCVWv9QZJfllJuKqUcUWtd9upXvzo333xzjjjiiEyaNClJstFGG2X99ddPkrzvfe/LCSeckKuvvjpLly5Nkmy33XYZNWpUxo0bly
Q5+eST09XVlSTZZptt+l42POCAA3Lttddm8uTJueCCCzJlypTUWv9krscffzxHHnlkJk6cmF/+8pfZfvvt87vf/S6llJRS8vrXvz5nnnlmSilJkt133z2//vWvm54rAGDN1ul7uI5N8o4k+yRJKSXLli3L9ttvv8KVt9tuu8yePTsTJ05MksybNy89PT2ZNWtWnnnmmSTJDjvs8KIHGzZsWLq6ujJjxowMGzYsPT09f7LOzjvvnF/96le5/PLLM3v27Bx66KHZa6+98sgjjyRJNt5442yyySYv2GZF4QYA8JxOf8jUOkkuTvKtl7PxySefnM9//vO5//7789RTT/Vr2+HDh2fZsmV/snzYsGGZNWtWkmTSpEn50Ic+1LccAODl6HRFLK61jq61frG/G86ePTtdXV0ZNWpUv7a7++67M27cuFx22WUvWH7kkUdm6tSpL7hatXTp0r6XDgEAXq5OB9eCUsouzz14+umns/HGG6/Sht/61rcyffr0fh/wvPPOy2mnnZYxY8Zk+PDhSZZf7fr4xz+eUaNG5Z577ulbd+TIkfn973+fJFm4cGG/jwUAkHT+JcVPJDmrlDI8Sdlggw1y7LHHrtKGhxxySNZdd91+H3Dy5MmZNm1aNttss757sSZPnpz3v//92XLLLbPVVlv1rXvUUUdlr732yogRI3LAAQf0+1gAAElSBtMN393d3XXOnDmdHgMAYKVKKbfUWrtXZd1OX+Fa43z31ofy2SvuysNPPJPNNnxFPrrPdjngTa/r9FgAwCAmuPrhu7c+lBO/c0eeWbz8M8AeeuKZnPidO5JEdAEAL6rTN82vUT57xV19sfWcZxYvzWevuKtDEwEAawLB1Q8PP/FMv5YDACSCq1822/AV/VoOAJAIrn756D7b5RUjh79g2StGDs9H99muQxMBAGsCN833w3M3xnuXIgDQH4Krnw540+sEFgDQL15SBABoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQmuAAAGhNcAACNCS4AgMYEFwBAY4ILAKAxwQUA0JjgAgBoTHABADQ2oOAqpXy2lDKvlHJ7KeXCUsqGz3vuxFLK3aWUu0op+wx8VACANdNAr3BdlWTHWmtXkl8kOTFJSilvSPKeJG9MMjHJWaWU4QM8FgDAGmlAwVVrvbLWuqT34U1JNu/9fv8ks2qtz9Za70tyd5K3DORYAABrqj/nPVwfSHJZ7/evS/Lr5z33YO8yAIAhZ8TKViilXJ1k1AqeOqnWelHvOiclWZLkG/0doJQyLcm0JNliiy36uzkAwKC30uCqte71Us+XUg5Lsm+SCbXW2rv4oSSvf95qm/cuW9H+z0lyTpJ0d3fXFa0DALAmG+i7FCcmOSHJfrXWp5/31PeSvKeUsm4pZesk2ya5eSDHAgBYU630CtdKnJlk3SRXlVKS5KZa65G11p+WUs5P8rMsf6nxg7XWpQM8FgDAGmlAwVVr/V8v8dy/JPmXgewfAGBt4JPmAQAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgD74zmYAAALh0lEQVSAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAaE1wAAI0JLgCAxgQXAEBjggsAoDHBBQDQmOACAGhsSATXs88+m4985COdHgMAGKJGdHqAVi6++OJ87nOfSyklS5Ysyc
c+9rFOjwQADFFrbXCdfvrpufzyy/OKV7yi06MAAEPcWvuS4nrrrZcnn3wyPT09mTJlSj760Y92eiQAYIhaa69wHXPMMbnhhhsycuTIPP3003nNa17T6ZEAgCGq1Fo7PUOf7u7uOmfOnFVef8mSJRkxYq1tRgBgECul3FJr7V6VdQf0kmIp5bhSyjW9XzeWUk7sXb5hKeX/lFJml1JuKKWM713+llLKnFLK+c/bxwdKKT8ppXwsSR544IG8853vzNixY7Pnnntm7ty5SZKenp6MGzcuY8eOzdixYzNhwoRcdNFFSZIZM2bkpptu6pvrlltuyZgxYzJ27Ng89thjSZKJEycmSb73ve9ll112yT/8wz+84Gc55ZRTMnv27IGcDgCAFXrZl4dKKROS1FrrczE1LMm/lVIOS7JTkq/VWi8tpWyU5KIk1yTZL8nRSY4tpWxSa+1J8q4kuyb5ryT58Ic/nFNOOSVdXV259957c+yxx+aSSy7J9ddfn2XLlmXvvffO/vvvnx133PFFZzv33HOz00475f77788666zzgufOP//8XH755fn7v//7l/ujAwD0y8u+wlVr/UGSX5ZSbiqlHFFrXZbknCRvSXJ2rfXS3vV+m+Sp3s3+M8m/JdkryfDeZXcleSTJtUly8sknp6urK0myzTbbZMmSJUmSAw44INdee20mT56cCy64IFOmTMmLvRy6xRZbZI899sgll1ySV77ylS947qijjsq73vWu3HHHHS/3RwcA6JeBvkvx2CTvSLJP7+NFSYbVWuetaOVa61211rFJLk+SUsr2STZJ8p4kr0iSHXbY4cWHHTYsXV1dmTFjRoYNG5aenp4VrnfMMcfkxBNPzJgxY3LppZe+4Lndd989119/fXbaaadV/ykBAAZgoMG1TpKL03t16mX4pyQfS7Igyfr92XD48OFZtmzZCp+bNWtWTjrppFx33XWZNGnSKu1v3rx5f3I1DADgz2GgwbW41jq61vrFl7Ht2CS311of6c9Gd999d8aNG5fLLrvsBcuPPPLITJ06NbXWPPDAA3nDG96wSvt78skn88lPfjLf/va3s/nmm/dnFACAVTLQ4FpQStnleY/flOTxVdz2b5Oc3t8DnnfeeTnttNMyZsyYDB++/Daw4cOH5+Mf/3hGjRqVe+65J5tuumluvvnmvm3mz5+fRYsW/cm+vvSlL+WAAw7IlVdemVtvvTWf+MQn+jsOAMBKDfRDrD6R5KxSyvAkJcn/TfL/ruK2/1+t9dn+HnDy5MmZNm1aNttss2yyySZ9y97//vdnyy23zFZbbZVDDjkk06ZNyx577NH30uMpp5zygv0sXLgwt912W6644op85CMfyXve854cf/zx/R0HAGCl1ugPPgUA6JTV9sGnAACs3JANrkvvvTR7X7B3ur7elb0v2DuX3nvpyjcCAHgZhuQfIrz03ksz44YZWbh0YZJk/lPzM+OGGUmSSdus2sdIAACsqiF5hWvmT2b2xdZzFi5dmJk/mdmhiQCAtdmQDK5HnlrxR3+92HIAgIEYksE1av1R/VoOADAQQzK4jtvluKw3fL0XLFtv+Ho5bpfjOjQRALA2G5I3zT93Y/zMn8zMI089klHrj8pxuxznhnkAoIkhGVzJ8ugSWADA6jAkX1IEAFidBBcAQGOCCwCgMcEFANCY4AIAaExwAQA0JrgAABoTXAAAjQkuAIDGBBcAQGOCCwCgMcEFANCY4AIAaGxAwVVK+WwpZV4p5fZSyoWllA17l29VSnmmlHJb79e//3nGBQBY8wz0CtdVSXastXYl+UWSE5/33D211p17v44c4HEAANZYAwquWuuVtdYlvQ9vSrL5wEcCAFi7/Dnv4fpAksue93jrUsqtpZT/LqWM+TMeBwBgjTJiZSuUUq5OMmoFT51Ua72od52TkixJ8o3e5+Yn2aLW+ngp5c1JvltKeWOt9fcr2P+0JNOSZIsttnh5PwUAwCC20uCqte71Us+XUg5Lsm
+SCbXW2rvNs0me7f3+llLKPUn+KsmcFez/nCTnJEl3d3ft5/wAAIPeQN+lODHJCUn2q7U+/bzlm5RShvd+v02SbZPcO5BjAQCsqVZ6hWslzkyybpKrSilJclPvOxL3SPKpUsriJMuSHFlr/e0AjwUAsEYaUHDVWv/Xiyz/dpJvD2TfAABri9J729WgUErpSfKr1XjIVyd5bDUebzBzLpZzHv7AuVjOeVjOefgD52I55yHZsta6yaqsOKiCa3UrpcyptXZ3eo7BwLlYznn4A+diOedhOefhD5yL5ZyH/vG3FAEAGhNcAACNDfXgOqfTAwwizsVyzsMfOBfLOQ/LOQ9/4Fws5zz0w5C+hwsAYHUY6le4AACaG7LBVUq5v5RyRynltlLKn/zJoaGilLJhKeWCUsq8UsrPSym7dnqmTiilbNf7u/Dc1+9LKdM7PVcnlFI+VEr5aSnlzlLKN0sp63V6pk4opRzXew5+OtR+F0op55ZSHi2l3Pm8ZRuVUq4qpfyy999XdXLG1eVFzsVBvb8Xy0opQ+Jdei9yHj7b+78dt5dSLiylbNjJGQe7IRtcvcbVWnce4m9rnZnk8lrr9kl2SvLzDs/TEbXWu3p/F3ZO8uYkTye5sMNjrXallNclOTZJd611xyTDk7yns1OtfqWUHZMcnuQtWf7fi31LKSv8oOe11NeSTPyjZR9P8oNa67ZJftD7eCj4Wv70XNyZ5MAkP1zt03TO1/Kn5+GqJDvWWruS/CLJiat7qDXJUA+uIa2UskGW/xmmryRJrXVRrfWJzk41KExIck+tdXV+CO9gMiLJK0opI5L8RZKHOzxPJ+yQ5Me11qdrrUuS/HeW/w/skFBr/WGSP/5zbPsn+Xrv919PcsBqHapDVnQuaq0/r7Xe1aGROuJFzsOVvf/9SJKbkmy+2gdbgwzl4KpJriyl3FJKmdbpYTpk6yQ9Sb5aSrm1lPK/Synrd3qoQeA9Sb7Z6SE6odb6UJLPJXkgyfwkC2qtV3Z2qo64M8mYUsrGpZS/SPLOJK/v8Eyd9tpa6/ze7x9J8tpODsOg84Ekl3V6iMFsKAfX6FrrLknekeSDpZQ9Oj1QB4xIskuSL9da35TkqQydlwlWqJSyTpL9kvyfTs/SCb335eyf5TG+WZL1Syl/39mpVr9a68+TnJrkyiSXJ7ktydKODjWI1OVvb/cWd5IkpZSTkixJ8o1OzzKYDdng6v1/8qm1Pprl9+q8pbMTdcSDSR6stf649/EFWR5gQ9k7kvyk1vqbTg/SIXslua/W2lNrXZzkO0l26/BMHVFr/Uqt9c211j2S/C7L71EZyn5TStk0SXr/fbTD8zAIlFIOS7JvkinV50y9pCEZXKWU9Uspf/nc90n2zvKXEIaUWusjSX5dStmud9GEJD/r4EiDwXszRF9O7PVAkreVUv6ilFKy/HdiSL6RopTymt5/t8jy+7f+q7MTddz3khza+/2hSS7q4CwMAqWUiUlOSLJfrfXpTs8z2A3JDz4tpWyTP7wDbUSS/6q1/ksHR+qYUsrOSf53knWS3Jvk/bXW33V2qs7oje8HkmxTa13Q6Xk6pZTyz0n+NstfIrg1ydRa67OdnWr1K6Vcl2TjJIuTfLjW+oMOj7TalFK+mWRsklcn+U2STyb5bpLzk2yR5FdJDq61/vGN9WudFzkXv03yxSSbJHkiyW211n06NePq8CLn4cQk6yZ5vHe1m2qtR3ZkwDXAkAwuAIDVaUi+pAgAsDoJLgCAxgQXAEBjggsAoDHBBQDQmOACAGhMcAEANCa4AAAa+/8BNXXVH2VmNl4AAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sample_words = \"อาหารเช้า อาหารสัตว์ อาหารเย็น อาหารกลางวัน\".split()\n", + "sample_idx = []\n", + "for word in sample_words:\n", + " sample_idx.append(labels.index(word))\n", + "sample_plot = thai2plot[sample_idx]\n", + "plot_with_labels(sample_plot,sample_words)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-25T09:03:55.915881Z", + "start_time": "2018-01-25T09:03:55.811980Z" + }, + "hidden": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'ดัน'" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.doesnt_match(\"ลาก ดึง ดูด ดัน\".split())" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-25T09:03:57.111707Z", + "start_time": "2018-01-25T09:03:56.766901Z" + }, + "hidden": true + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sample_words = \"ลาก ดึง ดูด ดัน\".split()\n", + "sample_idx = []\n", + "for word in sample_words:\n", + " sample_idx.append(labels.index(word))\n", + "sample_plot = thai2plot[sample_idx]\n", + "plot_with_labels(sample_plot,sample_words)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-25T09:00:04.040459Z", + "start_time": "2018-01-25T09:00:03.917290Z" + }, + "hidden": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'หมอ'" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.doesnt_match(\"แมว หมา หมู หมอ\".split())" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-25T09:00:23.186752Z", + "start_time": "2018-01-25T09:00:22.875191Z" + }, + "hidden": true + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sample_words = \"แมว หมา หมู หมอ\".split()\n", + "sample_idx = []\n", + "for word in sample_words:\n", + " sample_idx.append(labels.index(word))\n", + "sample_plot = thai2plot[sample_idx]\n", + "plot_with_labels(sample_plot,sample_words)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cosine Similarity" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-25T08:49:26.327610Z", + "start_time": "2018-01-25T08:49:26.219002Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "China + Beijing: 0.3135956\n", + "Italy + Rome: 0.42819628\n", + "Beijing + Rome: 0.27347285\n", + "Italy + Beijing: 0.17900795\n", + "China + Rome: 0.02666693\n", + "China + Italy: 0.24352394\n" + ] + } + ], + "source": [ + "print('China + Beijing:', model.similarity('ปักกิ่ง', 'จีน'))\n", + "print('Italy + Rome:', model.similarity('โรม','อิตาลี'))\n", + "print('Beijing + Rome:', model.similarity('โรม', 'ปักกิ่ง'))\n", + "print('Italy + Beijing:', model.similarity('ปักกิ่ง', 'อิตาลี'))\n", + "print('China + Rome:', model.similarity('โรม','จีน'))\n", + "print('China + Italy:', model.similarity('อิตาลี','จีน'))" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "ExecuteTime": { + "end_time": "2018-01-25T08:55:58.556210Z", + "start_time": "2018-01-25T08:55:58.223988Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sample_words = \"ปักกิ่ง จีน โรม อิตาลี โตเกียว ญี่ปุ่น\".split()\n", + "sample_idx = []\n", + "for word in sample_words:\n", + " sample_idx.append(labels.index(word))\n", + "sample_plot = thai2plot[sample_idx]\n", + "plot_with_labels(sample_plot,sample_words)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spellchecking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Originally contributed by [Sakares ATV](https://github.com/sakares), adapted from [Kaggle Spell Checker using Word2vec by CPMP](https://www.kaggle.com/cpmpml/spell-checker-using-word2vec)." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "words = model.index2word\n", + "\n", + "w_rank = {}\n", + "for i,word in enumerate(words):\n", + " w_rank[word] = i\n", + "\n", + "WORDS = w_rank" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "thai_letters = 'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤฤๅลฦฦๅวศษสหฬอฮะัาำิีึืุูเแโใไ็่้๊๋์'\n", + "\n", + "def words(text): return re.findall(r'\\w+', text.lower())\n", + "\n", + "def P(word): \n", + " \"Probability of `word`.\"\n", + " # use inverse of rank as proxy\n", + " # returns 0 if the word isn't in the dictionary\n", + " return - WORDS.get(word, 0)\n", + "\n", + "def correction(word): \n", + " \"Most probable spelling correction for word.\"\n", + " return max(candidates(word), key=P)\n", + "\n", + "def candidates(word): \n", + " \"Generate possible spelling corrections for word.\"\n", + " return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])\n", + "\n", + "def known(words): \n", + " \"The subset of `words` that appear in the dictionary of WORDS.\"\n", + " return set(w for w in words if w in WORDS)\n", + "\n", + "def edits1(word):\n", + " \"All 
edits that are one edit away from `word`.\"\n", + " letters = thai_letters\n", + " splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]\n", + " deletes = [L + R[1:] for L, R in splits if R]\n", + " transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]\n", + " replaces = [L + c + R[1:] for L, R in splits if R for c in letters]\n", + " inserts = [L + c + R for L, R in splits for c in letters]\n", + " return set(deletes + transposes + replaces + inserts)\n", + "\n", + "def edits2(word): \n", + " \"All edits that are two edits away from `word`.\"\n", + " return (e2 for e1 in edits1(word) for e2 in edits1(e1))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'พัฒนา'" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "correction('พัดนา')" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'จริง'" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "correction('ขริง')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'จ้า'" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "correction('จย้า')" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'นะคะ'" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "correction('นะค่ะ')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + 
"name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pythainlp/benchmarks/__init__.py b/pythainlp/benchmarks/__init__.py index 711404da2..181e8b775 100644 --- a/pythainlp/benchmarks/__init__.py +++ b/pythainlp/benchmarks/__init__.py @@ -1,3 +1,3 @@ -from .word_tokenisation import benchmark +from .word_tokenization import benchmark __all__ = ["benchmark"] \ No newline at end of file diff --git a/pythainlp/benchmarks/word_tokenisation.py b/pythainlp/benchmarks/word_tokenization.py similarity index 86% rename from pythainlp/benchmarks/word_tokenisation.py rename to pythainlp/benchmarks/word_tokenization.py index 75e074b84..ee3c21ae9 100644 --- a/pythainlp/benchmarks/word_tokenisation.py +++ b/pythainlp/benchmarks/word_tokenization.py @@ -1,11 +1,12 @@ # -*- coding: utf-8 -*- -import sys import re +import sys import numpy as np import pandas as pd + SEPARATOR = "|" # regex for removing to a space surrounded by separators, i.e. | | @@ -68,7 +69,7 @@ def benchmark(ref_samples: list, samples: list): """ Performace benchmark of samples - Please see :meth:`pythainlp.benchmarks.word_tokenisation.compute_stats` for + Please see :meth:`pythainlp.benchmarks.word_tokenization.compute_stats` for metrics being computed. 
:param list[str] ref_samples: ground truth samples @@ -173,26 +174,25 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict: c_f1 = _f1(c_precision, c_recall) # Compute word-level statistics - word_boundaries = _find_word_boudaries(ref_sample) - - correctly_tokenised_words = _count_correctly_tokenised_words( - sample, - word_boundaries - ) - w_precision = correctly_tokenised_words / np.sum(sample) - w_recall = correctly_tokenised_words / np.sum(ref_sample) - w_f1 = _f1(w_precision, w_recall) + # Find correctly tokenized words in the reference sample + word_boundaries = _find_word_boudaries(ref_sample) # Find correctly tokenized words in the sample ss_boundaries = _find_word_boudaries(sample) - tokenisation_indicators = _find_words_correctly_tokenised( + tokenization_indicators = _find_words_correctly_tokenised( word_boundaries, ss_boundaries ) - tokenisation_indicators = list( - map(lambda x: str(x), tokenisation_indicators) + correctly_tokenised_words = np.sum(tokenization_indicators) + + w_precision = correctly_tokenised_words / np.sum(sample) + w_recall = correctly_tokenised_words / np.sum(ref_sample) + w_f1 = _f1(w_precision, w_recall) + + tokenization_indicators = list( + map(lambda x: str(x), tokenization_indicators) ) return { @@ -211,7 +211,7 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict: 'f1': w_f1 }, 'global': { - 'tokenisation_indicators': "".join(tokenisation_indicators) + 'tokenisation_indicators': "".join(tokenization_indicators) } } @@ -267,29 +267,6 @@ def _find_word_boudaries(bin_reps) -> list: return list(zip(start_idx, end_idx)) -def _count_correctly_tokenised_words(bin_reps, word_boundaries) -> list: - """ - Count how many words are tokenized correctly - - :param str bin_reps: binary representation of a text - :param list[tuple(int, int)] word_boundaries: list of when each word starts and ends - - :return: no. 
correctly tokenized words - :rtype: int - """ - count = 0 - for st, end in word_boundaries: - pend = min(end, bin_reps.shape[0]) - if (bin_reps[st] == 1 and np.sum(bin_reps[st+1:pend]) == 0) \ - and ( - (pend == bin_reps.shape[0]) or - (pend != bin_reps.shape[0] and bin_reps[pend] == 1) - ): - count = count + 1 - - return count - - def _find_words_correctly_tokenised( ref_boundaries: list, predicted_boundaries: list diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index d480768fc..b6690018d 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -94,7 +94,7 @@ def get_ner( ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]: """ This function tags named-entitiy from text in IOB format. - + :param string text: text in Thai to be tagged :param boolean pos: To include POS tags in the results (`True`) or exclude (`False`). The defualt value is `True` diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 2db648f91..95170eb4d 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -20,7 +20,6 @@ def word_tokenize( ) -> List[str]: """ This function tokenizes running text into words. - :param str text: text to be tokenized :param str engine: name of the tokenizer to be used :param marisa_trie.Trie custom_dict: marisa dictionary trie @@ -29,7 +28,6 @@ def word_tokenize( Otherwise, whitespaces are omitted. :return: list of words :rtype: list[str] - **Options for engine** * *newmm* (default) - dictionary-based, Maximum Matching + Thai Character Cluster @@ -39,19 +37,15 @@ def word_tokenize( language-model-based * *icu* - wrapper for ICU (International Components for Unicode, using PyICU), dictionary-based - + * attacut - Wrapper for `AttaCut (https://github.com/PyThaiNLP/attacut)` .. 
warning:: * the option for engine named *ulmfit* has been deprecated since \ PyThaiNLP version 2.1 - :Note: - The parameter **custom_dict** can be provided as an argument \ only for *newmm*, *longest*, and *deepcut* engine. - :Example: - Tokenize text with different tokenizer: - >>> from pythainlp.tokenize import word_tokenize >>> >>> text = "โอเคบ่พวกเรารักภาษาบ้านเกิด" @@ -69,9 +63,10 @@ def word_tokenize( >>> >>> word_tokenize(text, engine="ulmfit") ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด'] - + >>> tokenize.word_tokenize('โอเคบ่พวกเรารักภาษาบ้านเกิด', engine='attacut') + ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด'] + >>> Tokenize text by omitiing whitespaces: - >>> from pythainlp.tokenize import word_tokenize >>> >>> text = "วรรณกรรม ภาพวาด และการแสดงงิ้ว " @@ -79,9 +74,7 @@ def word_tokenize( ['วรรณกรรม', ' ', 'ภาพวาด', ' ', 'และ', 'การแสดง', 'งิ้ว', ' '] >>> word_tokenize(text, engine="newmm", keep_whitespace=False) ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว'] - Tokenize with default and custom dictionary: - >>> from pythainlp.corpus.common import thai_words >>> from pythainlp.tokenize import dict_trie, word_tokenize >>> @@ -123,6 +116,10 @@ def word_tokenize( elif engine == "icu": from .pyicu import segment + segments = segment(text) + elif engine == 'attacut': + from .attacut import segment + segments = segment(text) else: # default, use "newmm" engine from .newmm import segment @@ -134,7 +131,6 @@ def word_tokenize( return segments - def dict_word_tokenize( text: str, custom_dict: Trie = DEFAULT_DICT_TRIE, @@ -171,22 +167,17 @@ def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]: This function does not yet automatically recognize when a sentence actually ends. Rather it helps split text where white space and a new line is found. 
- :param str text: the text to be tokenized :param str engine: choose between *'whitespace'* or *'whitespace+newline'* :return: list of splited sentences :rtype: list[str] - **Options for engine** * *whitespace+newline* (default) - split by whitespace token \ and newline. * *whitespace* - split by whitespace token. Specifiaclly, with \ :class:`regex` pattern ``r" +"`` - :Example: - Split the text based on *whitespace* - >>> from pythainlp.tokenize import sent_tokenize >>> sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม" >>> sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\ @@ -196,9 +187,7 @@ def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]: >>> sent_tokenize(sentence_2, engine="whitespace") ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ', '\\nและได้รับมอบหมายให้ประจำในระดับภูมิภาค'] - Split the text based on *whitespace* and *newline* - >>> from pythainlp.tokenize import sent_tokenize >>> sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม" >>> sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\ @@ -229,30 +218,23 @@ def subword_tokenize(text: str, engine: str = "tcc") -> List[str]: Thai contiguous characters namely `Thai Character Clusters (TCCs) \ `_ - TCCs are the units based on Thai spelling feature that could not be separated any character further such as 'ก็', 'จะ', 'ไม่', and 'ฝา'. If the following units are separated, they could not be spelled out. - This function apply the TCC rules to tokenizes the text into the smallest units. For example, the word 'ขนมชั้น' would be tokenized into 'ข', 'น', 'ม', and 'ชั้น' - :param str text: text to be tokenized :param str engine: the name subword tokenizer :return: list of subwords :rtype: list[str] - **Options for engine** * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000) * *ssg* - CRF syllable segmenter for Thai. * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 
2001) [In development] - :Example: - Tokenize text into subword based on *tcc* - >>> from pythainlp.tokenize import subword_tokenize >>> text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง" >>> text_2 = "ความแปลกแยกและพัฒนาการ" @@ -262,9 +244,7 @@ def subword_tokenize(text: str, engine: str = "tcc") -> List[str]: >>> subword_tokenize(text_2, engine='tcc') ['ค', 'วา', 'ม', 'แป', 'ล', 'ก', 'แย', 'ก', 'และ', 'พัฒ','นา', 'กา', 'ร'] - Tokenize text into subword based on *etcc* **(Work In Progress)** - >>> from pythainlp.tokenize import subword_tokenize >>> text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง" >>> text_2 = "ความแปลกแยกและพัฒนาการ" @@ -291,7 +271,6 @@ def syllable_tokenize(text: str, engine: str = "default") -> List[str]: This function is to tokenize text into syllable (Thai: พยางค์), a unit of pronunciation having one vowel sound. For example, the word 'รถไฟ' contains two syallbles including 'รถ', and 'ไฟ'. - Under the hood, this function uses :func:`pythainlp.tokenize.word_tokenize` with *newmm* as a tokenizer. The function tokenize the text with the dictionary of Thai words from @@ -299,17 +278,13 @@ def syllable_tokenize(text: str, engine: str = "default") -> List[str]: and then dictionary of Thai syllable from :func:`pythainlp.corpus.common.thai_syllables`. As a result, only syllables are obtained. - :param str text: input string to be tokenized :return: list of syllables where whitespaces in the text **are included** :rtype: list[str] - **Options for engine** * *default* * *ssg* - CRF syllable segmenter for Thai. 
- :Example: - >>> from pythainlp.tokenize import syllable_tokenize >>> >>> text = 'รถไฟสมัยใหม่จะใช้กำลังจากหัวรถจักรดีเซล หรือจากไฟฟ้า' @@ -340,7 +315,6 @@ def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie: For more information on the trie data structure, see: `marisa-trie's Official Documentation \ `_ - :param string/list dict_source: a list of vocaburaries or a path to source file :return: a trie created from a dictionary input @@ -366,19 +340,16 @@ def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie: return trie + class Tokenizer: """ This class allows users to pre-define custom dictionary along with tokenizer and encapsulate them into one single object. - It is an wrapper for both two functions including :func:`pythainlp.tokenize.word_tokenize`, and :func:`pythainlp.tokenize.dict_trie` - :Example: - Tokenizer object instantiated with :class:`marisa_trie.Trie` - >>> from pythainlp.tokenize import Tokenizer >>> from pythainlp.tokenize import Tokenizer, dict_trie >>> from pythainlp.corpus.common import thai_words @@ -392,9 +363,7 @@ class Tokenizer: >>> _tokenizer = Tokenizer(custom_dict=trie, engine='newmm') ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ', 'ผิดปกติ', 'ของ', 'การ', 'พูด'] - Tokenizer object instantiated with a list of words - >>> from pythainlp.tokenize import Tokenizer >>> from pythainlp.corpus.common import thai_words >>> @@ -403,12 +372,9 @@ class Tokenizer: >>> _tokenizer.word_tokenize(text) ['อะ', 'เฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ', 'ผิดปกติ', 'ของ', 'การ', 'พูด'] - - Tokenizer object instantiated with a file path containing list of word separated with *newline* and explicitly set a new tokeneizer after initiation. 
- >>> from pythainlp.tokenize import Tokenizer >>> >>> PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txtt' @@ -438,7 +404,6 @@ def __init__( ): """ Initialize tokenizer object - :param str: a file path, a list of vocaburaies* to be used to create a trie, or an instantiated :class:`marisa_trie.Trie` object. @@ -455,7 +420,6 @@ def __init__( def word_tokenize(self, text: str) -> List[str]: """ :param str text: text to be tokenized - :return: list of words, tokenized from the text :rtype: list[str] """ @@ -464,9 +428,8 @@ def word_tokenize(self, text: str) -> List[str]: def set_tokenize_engine(self, engine: str) -> None: """ Set the tokenizer - :param str engine: choose between different options of engine to token (i.e. *newmm*, *longest*, *deepcut*) - """ self.__engine = engine + diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py new file mode 100644 index 000000000..07e12c906 --- /dev/null +++ b/pythainlp/tokenize/attacut.py @@ -0,0 +1,15 @@ +""" +Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai +:See Also: + * `GitHub repository `_ +""" +from typing import List + +import attacut + + +def segment(text: str) -> List[str]: + if not text or not isinstance(text, str): + return [] + + return attacut.tokenize(text) diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 394444c85..1ae7aeab1 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -1,9 +1,7 @@ -# -*- coding: utf-8 -*- """ Wrapper for deepcut Thai word segmentation. deepcut is a Thai word segmentation library using Deep Neural, specifically, 1D Convolution Neural Network. 
- :See Also: * `GitHub repository `_ """ @@ -26,3 +24,4 @@ def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[ return deepcut.tokenize(text, custom_dict) return deepcut.tokenize(text) + diff --git a/pythainlp/ulmfit/__init__.py b/pythainlp/ulmfit/__init__.py index 5e9f4d3f7..89dfbe2a8 100644 --- a/pythainlp/ulmfit/__init__.py +++ b/pythainlp/ulmfit/__init__.py @@ -5,8 +5,8 @@ """ import collections import re -from typing import List, Collection, Callable +from typing import List, Collection, Callable import emoji import html import numpy as np @@ -18,7 +18,8 @@ ''' # Fastai dependencies -The following codes are copied from copied from https://github.com/fastai/fastai/blob/master/fastai/text/transform.py +The following codes are copied from +https://github.com/fastai/fastai/blob/master/fastai/text/transform.py in order to avoid importing the entire fastai library ''' @@ -26,28 +27,67 @@ TK_REP = 'xxrep' TK_WREP = 'xxwrep' TK_END = 'xxend' +TK_URL ='xxurl' class BaseTokenizer(): - "Basic class for a tokenizer function." - def __init__(self, lang:str): self.lang = lang - def tokenizer(self, t:str) -> List[str]: return t.split(' ') - def add_special_cases(self, toks:Collection[str]): pass + """Basic class for a tokenizer function. (code from `fastai`)""" + def __init__(self, lang: str): self.lang = lang + + def tokenizer(self, t: str) -> List[str]: return t.split(' ') + + def add_special_cases(self, toks: Collection[str]): pass -def fix_html(x:str) -> str: - "List of replacements from html strings in `x`." 
+def replace_url(x): + """ + Replace url in `x` with TK_URL + + :param str x: text to replace url + + :return: text where urls are replaced + :rtype: str + + :Example: + + >>> from pythainlp.ulmfit import replace_url + >>> replace_url("go to github.com") + go to xxurl + """ + URL_PATTERN = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(? str: + """ + List of replacements from html strings in `x`. (code from `fastai`) + + :param str x: text to replace html string + + :return: text where html strings are replaced + :rtype: str + + :Example: + + >>> from pythainlp.ulmfit import fix_html + >>> fix_html("Anbsp;amp;nbsp;B @.@ ") + A & B. + """ re1 = re.compile(r' +') - x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace( - 'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace( - '<br />', "\n").replace('\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace( - ' @-@ ','-').replace(' @,@ ',',').replace('\\', ' \\ ') + x = x.replace('#39;', "'").replace('amp;', '&').replace( + '#146;', "'").replace('nbsp;', ' ').replace( + '#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace( + '<br />', "\n").replace('\\"', '"').replace('<unk>', UNK).replace( + ' @.@ ', '.').replace(' @-@ ', '-').replace(' @,@ ', ',').replace( + '\\', ' \\ ') return re1.sub(' ', html.unescape(x)) -def rm_useless_spaces(t:str) -> str: - "Remove multiple spaces in `t`." + +def rm_useless_spaces(t: str) -> str: + """Remove multiple spaces in `t`. (code from `fastai`)""" return re.sub(' {2,}', ' ', t) -def spec_add_spaces(t:str) -> str: - "Add spaces around / and # in `t`. \n" + +def spec_add_spaces(t: str) -> str: + """Add spaces around / and # in `t`. \n (code from `fastai`)""" return re.sub(r'([/#\n])', r' \1 ', t) ''' @@ -74,6 +114,7 @@ def spec_add_spaces(t:str) -> str: _THAI2FIT_WORDS = get_corpus("words_th_thai2fit_201810.txt") _pythainlp_tokenizer = Tokenizer(custom_dict=_THAI2FIT_WORDS, engine="newmm") + # Download pretrained models def _get_path(fname: str) -> str: """ @@ -122,7 +163,7 @@ def tokenizer(text: str) -> List[str]: >>> >>> text = "อาภรณ์, จินตมยปัญญา ภาวนามยปัญญา" >>> ThaiTokenizer.tokenizer(text) - ['อาภรณ์', ',', ' ', 'จิน', 'ตม', 'ย', 'ปัญญา', + ['อาภรณ์', ',', ' ', 'จิน', 'ตม', 'ย', 'ปัญญา', ' ', 'ภาวนามยปัญญา'] >>> >>> word_tokenize(text, engine='ulmfit') @@ -139,8 +180,22 @@ def add_special_cases(self, toks): def replace_rep_after(text: str) -> str: """ Replace repetitions at the character level in `text` after the repetition. 
- This is done to prevent such case as 'น้อยยยยยยยย' becoming 'น้อ xxrep 8 ย'; - instead it will retain the word as 'น้อย xxrep 8' + This is done to prevent such case as 'น้อยยยยยยยย' becoming 'น้อ xxrep 8 ย' + ;instead it will retain the word as 'น้อย xxrep 8' + + :param str text: input text to replace character repetition + + :return: text with repetitive token **xxrep** and the counter + after character repetition + + :rtype: str + :Example: + + >>> from pythainlp.ulmfit import replace_rep_after + >>> + >>> text = "กาาาาาาา" + >>> replace_rep_after(text) + 'กาxxrep7 ' """ def _replace_rep(m): @@ -151,23 +206,39 @@ def _replace_rep(m): return re_rep.sub(_replace_rep, text) -def replace_wrep_post(toks:Collection): + +def replace_wrep_post(toks: Collection): """ - Replace reptitive words post tokenization; + Replace reptitive words post tokenization; fastai `replace_wrep` does not work well with Thai. + + :param list[str] toks: list of tokens + + :return: list of tokens where **xxwrep** token and the counter + is added in front of repetitive words. 
+ :rtype: list[str] + + :Example: + + >>> from pythainlp.ulmfit import replace_wrep_post + >>> + >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"] + >>> replace_wrep_post(toks) + ['กา', 'xxwrep', '3', 'น้ำ'] + """ previous_word = None rep_count = 0 res = [] for current_word in toks+[TK_END]: - if current_word==previous_word: - rep_count+=1 - elif (current_word!=previous_word) & (rep_count>0): - res += [TK_WREP,str(rep_count),previous_word] - rep_count=0 + if current_word == previous_word: + rep_count += 1 + elif (current_word != previous_word) & (rep_count > 0): + res += [TK_WREP, str(rep_count), previous_word] + rep_count = 0 else: res.append(previous_word) - previous_word=current_word + previous_word = current_word return res[1:] @@ -186,7 +257,7 @@ def rm_brackets(text: str) -> str: return new_line -def ungroup_emoji(toks:Collection): +def ungroup_emoji(toks: Collection): "Ungroup emojis" res = [] for tok in toks: @@ -197,17 +268,34 @@ def ungroup_emoji(toks:Collection): res.append(tok) return res -def lowercase_all(toks:Collection): - """Lowercase all English words; + +def lowercase_all(toks: Collection): + """Lowercase all English words; English words in Thai texts don't usually have nuances of capitalization. """ return [tok.lower() for tok in toks] + def replace_rep_nonum(text: str) -> str: """ Replace repetitions at the character level in `text` after the repetition. 
- This is done to prevent such case as 'น้อยยยยยยยย' becoming 'น้อ xrep 8 ย'; - instead it will retain the word as 'น้อย xrep 8' + This is done to prevent such case as 'น้อยยยยยยยย' becoming 'น้อ xxrep ย'; + instead it will retain the word as 'น้อย xxrep ' + + :param str text: input text to replace character repetition + + :return: text with repetitive token **xxrep** after + character repetition + :rtype: str + + :Example: + + >>> from pythainlp.ulmfit import replace_rep_nonum + >>> + >>> text = "กาาาาาาา" + >>> replace_rep_nonum(text) + 'กา xxrep ' + """ def _replace_rep(m): c, cc = m.groups() @@ -215,38 +303,62 @@ def _replace_rep(m): re_rep = re.compile(r"(\S)(\1{3,})") return re_rep.sub(_replace_rep, text) -def replace_wrep_post_nonum(toks:Collection): + +def replace_wrep_post_nonum(toks: Collection): """ - Replace reptitive words post tokenization; + Replace reptitive words post tokenization; fastai `replace_wrep` does not work well with Thai. + + :param list[str] toks: list of tokens + + :return: list of tokens where **xxwrep** token is added in front of + repetitive words. + :rtype: list[str] + + :Example: + + >>> from pythainlp.ulmfit import replace_wrep_post_nonum + >>> + >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"] + >>> replace_wrep_post_nonum(toks) + ['กา', 'xxwrep', 'น้ำ'] + """ previous_word = None rep_count = 0 res = [] for current_word in toks+[TK_END]: - if current_word==previous_word: - rep_count+=1 - elif (current_word!=previous_word) & (rep_count>0): - res += [TK_WREP,previous_word] - rep_count=0 + if current_word == previous_word: + rep_count += 1 + elif (current_word != previous_word) & (rep_count > 0): + res += [TK_WREP, previous_word] + rep_count = 0 else: res.append(previous_word) - previous_word=current_word + previous_word = current_word return res[1:] -def remove_space(toks:Collection): + +def remove_space(toks: Collection): """ Do not include space for bag-of-word models. 
+ + :param list[str] toks: list of tokens + + :return: list of tokens where space tokens (" ") are filtered out + :rtype: list[str] """ res = [] for t in toks: - if t!=' ': res.append(t) + if t != ' ': + res.append(t) return res # Pretrained paths # TODO: Let the user decide if they like to download (at setup?) _THWIKI_LSTM = dict( - wgts_fname=_get_path(_MODEL_NAME_LSTM), itos_fname=_get_path(_ITOS_NAME_LSTM) + wgts_fname=_get_path(_MODEL_NAME_LSTM), + itos_fname=_get_path(_ITOS_NAME_LSTM) ) # Preprocessing rules for Thai text @@ -259,30 +371,90 @@ def remove_space(toks:Collection): rm_useless_spaces, rm_useless_newlines, rm_brackets, + replace_url, ] + post_rules_th = [replace_wrep_post, ungroup_emoji, lowercase_all,] # sparse features pre_rules_th_sparse = pre_rules_th[1:] + [replace_rep_nonum] -post_rules_th_sparse = post_rules_th[1:] + [replace_wrep_post_nonum, remove_space] +post_rules_th_sparse = post_rules_th[1:] + [replace_wrep_post_nonum, + remove_space] + -def process_thai(text: str, pre_rules: Collection = pre_rules_th_sparse, tok_func:Callable = _pythainlp_tokenizer.word_tokenize, - post_rules: Collection = post_rules_th_sparse) -> Collection[str]: +def process_thai(text: str, pre_rules: Collection = pre_rules_th_sparse, + tok_func: Callable = _pythainlp_tokenizer.word_tokenize, + post_rules: Collection = post_rules_th_sparse) -> Collection[str]: """ Process Thai texts for models (with sparse features as default) + :param str text: text to be cleaned - :param pre_rules List: rules to apply before tokenization - :param tok_func Callable: tokenization function - :param post_rules List: rules to apply after tokenization + :param list[func] pre_rules: rules to apply before tokenization. 
+ :param func tok_func: tokenization function (by default, **tok_func** is + :func:`pythainlp.tokenize.word_tokenize`) + + :param list[func] post_rules: rules to apply after tokenization + :return: a list of cleaned tokenized texts + :rtype: list[str] + + + :Note: + - The default **pre-rules** consists of :func:`fix_html`, + :func:`pythainlp.util.normalize`, + :func:`spec_add_spaces`, + :func:`rm_useless_spaces`, + :func:`rm_useless_newlines`, + :func:`rm_brackets` + and :func:`replace_rep_nonum`. + + - The default **post-rules** consists of :func:`ungroup_emoji`, + :func:`lowercase_all`, :func:`replace_wrep_post_nonum`, + and :func:`remove_space`. + + :Example: + + 1. Use default pre-rules and post-rules: + + >>> from pythainlp.ulmfit import process_thai + >>> text = "บ้านนนนน () อยู่นานนานนาน 😂🤣😃😄😅 PyThaiNLP amp; " + >>> process_thai(text) + ['บ้าน', 'xxrep', ' ', 'อยู่', 'xxwrep', 'นาน', '😂', '🤣', + '😃', '😄', '😅', 'pythainlp', '&'] + + 2. Modify pre_rules and post_rules arguments with + rules provided in :mod:`pythainlp.ulmfit`: + + >>> from pythainlp.ulmfit import ( + process_thai, + replace_rep_after, + fix_html, + ungroup_emoji, + replace_wrep_post, + remove_space) + >>> + >>> text = "บ้านนนนน () อยู่นานนานนาน 😂🤣😃😄😅 PyThaiNLP amp; " + >>> process_thai(text, + pre_rules=[replace_rep_after, fix_html], + post_rules=[ungroup_emoji, + replace_wrep_post, + remove_space] + ) + ['บ้าน', 'xxrep', '5', '()', 'อยู่', 'xxwrep', '2', 'นาน', '😂', '🤣', + '😃', '😄', '😅', 'PyThaiNLP', '&'] + + """ res = text - for pre in pre_rules: res = pre(res) + for pre in pre_rules: + res = pre(res) res = tok_func(res) - for post in post_rules: res = post(res) + for post in post_rules: + res = post(res) return res _tokenizer = ThaiTokenizer() + def document_vector(text: str, learn, data, agg: str = "mean"): """ This function vectorize Thai input text into a 400 dimension vector using @@ -326,7 +498,8 @@ def document_vector(text: str, learn, data, agg: str = "mean"): """ s = 
_tokenizer.tokenizer(text) - t = torch.tensor(data.vocab.numericalize(s), requires_grad=False).to(device) + t = torch.tensor(data.vocab.numericalize(s), + requires_grad=False).to(device) m = learn.model[0].encoder.to(device) res = m(t).cpu().detach().numpy() if agg == "mean": diff --git a/setup.py b/setup.py index 85739ef8b..bc8830c87 100644 --- a/setup.py +++ b/setup.py @@ -10,20 +10,21 @@ extras = { "artagger": ["artagger"], + "attacut": ["attacut"], "deepcut": ["deepcut", "keras", "tensorflow"], "icu": ["pyicu"], "ipa": ["epitran"], "ssg": ["ssg"], - "ml": ["fastai>=1.0.38", "keras", "numpy", "torch"], + "ml": ["keras", "numpy", "torch"], "ner": ["sklearn-crfsuite"], "thai2fit": ["emoji", "gensim", "numpy"], "thai2rom": ["torch", "numpy"], "benchmarks": ["numpy", "pandas"], "full": [ "artagger", + "attacut", "deepcut", "epitran", - "fastai>=1.0.38", "gensim", "keras", "numpy", @@ -39,7 +40,7 @@ setup( name="pythainlp", - version="2.1.dev2", + version="2.1.dev4", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", diff --git a/tests/data/sentences.yml b/tests/data/sentences.yml index 6d913a38d..3c8a0dcc9 100644 --- a/tests/data/sentences.yml +++ b/tests/data/sentences.yml @@ -39,4 +39,8 @@ binary_sentences: - expected: "10001010" actual: "10101000" - expected_count: 0 \ No newline at end of file + expected_count: 0 + - + expected: "10101001000" # "ฝน|ตก|ที่|ทะเล + actual: "10001001010" # "ฝนตก|ที่|ทะ|เล" + expected_count: 1 \ No newline at end of file diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index aad63fd76..e0c95892c 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -1,31 +1,29 @@ -import datetime -import os -import sys import unittest -import yaml + import numpy as np +import yaml +from pythainlp.benchmarks import word_tokenization -from pythainlp.benchmarks import word_tokenisation -with open("./tests/data/sentences.yml", 'r') as stream: 
+with open("./tests/data/sentences.yml", "r", encoding="utf8") as stream: TEST_DATA = yaml.safe_load(stream) class TestBenchmarksPackage(unittest.TestCase): def test_preprocessing(self): - self.assertIsNotNone(word_tokenisation.preprocessing( + self.assertIsNotNone(word_tokenization.preprocessing( txt="ทดสอบ การ ทำ ความสะอาด ข้อมูลok" )) def test_benchmark_not_none(self): - self.assertIsNotNone(word_tokenisation.benchmark( + self.assertIsNotNone(word_tokenization.benchmark( ["วัน", "จัน", "ทร์", "สี", "เหลือง"], ["วัน", "จันทร์", "สี", "เหลือง"] )) def test_binary_representation(self): sentence = "อากาศ|ร้อน|มาก|ครับ" - rept = word_tokenisation._binary_representation(sentence) + rept = word_tokenization._binary_representation(sentence) self.assertEqual( [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0], @@ -36,9 +34,9 @@ def test_compute_stats(self): for pair in TEST_DATA['sentences']: exp, act = pair['expected'], pair['actual'] - result = word_tokenisation.compute_stats( - word_tokenisation.preprocessing(exp), - word_tokenisation.preprocessing(act) + result = word_tokenization.compute_stats( + word_tokenization.preprocessing(exp), + word_tokenization.preprocessing(act) ) self.assertIsNotNone(result) @@ -50,7 +48,7 @@ def test_benchmark(self): expected.append(pair['expected']) actual.append(pair['actual']) - df = word_tokenisation.benchmark(expected, actual) + df = word_tokenization.benchmark(expected, actual) self.assertIsNotNone(df) @@ -59,10 +57,15 @@ def test_count_correctly_tokenised_words(self): sample = np.array(list(d['actual'])).astype(int) ref_sample = np.array(list(d['expected'])).astype(int) - wb = list(word_tokenisation._find_word_boudaries(ref_sample)) + sb = list(word_tokenization._find_word_boudaries(sample)) + rb = list(word_tokenization._find_word_boudaries(ref_sample)) + + # in binary [{0, 1}, ...] 
+ correctly_tokenized_words = word_tokenization\ + ._find_words_correctly_tokenised(rb, sb) self.assertEqual( - word_tokenisation._count_correctly_tokenised_words(sample, wb), + np.sum(correctly_tokenized_words), d['expected_count'] ) @@ -72,7 +75,7 @@ def test_words_correctly_tokenised(self): expected = "01" - labels = word_tokenisation._find_words_correctly_tokenised(r, s) + labels = word_tokenization._find_words_correctly_tokenised(r, s) self.assertEqual(expected, "".join(np.array(labels).astype(str))) def test_flatten_result(self): @@ -81,5 +84,5 @@ def test_flatten_result(self): key2=dict(v2=7) ) - actual = word_tokenisation._flatten_result(result) - self.assertEqual(actual, {'key1:v1': 6, 'key2:v2': 7}) \ No newline at end of file + actual = word_tokenization._flatten_result(result) + self.assertEqual(actual, {'key1:v1': 6, 'key2:v2': 7}) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 9ddc4ed83..e4aca4a9e 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -12,6 +12,7 @@ ) from pythainlp.tokenize import DEFAULT_DICT_TRIE, Tokenizer from pythainlp.tokenize import deepcut as tokenize_deepcut +from pythainlp.tokenize import attacut from pythainlp.tokenize import ( dict_trie, dict_word_tokenize, @@ -70,6 +71,9 @@ def test_word_tokenize(self): self.assertIsNotNone( word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX") ) + self.assertIsNotNone( + word_tokenize("หมอนทองตากลมหูว์MBK39", engine="attacut") + ) self.assertIsNotNone(dict_trie(())) self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie"))) @@ -174,6 +178,16 @@ def test_word_tokenize_newmm(self): ["จุ๋ม", "ง่วง"], ) + + def test_word_tokenize_attacut(self): + self.assertEqual(attacut.segment(None), []) + self.assertEqual(attacut.segment(""), []) + self.assertEqual( + word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="attacut"), + ['ฉัน', 'รัก', 'ภาษา', 'ไทย', 'เพราะ', 'ฉัน', 'เป็น', 'คน', 'ไทย'], + ) + + def test_sent_tokenize(self): 
 self.assertEqual(sent_tokenize(None), []) self.assertEqual(sent_tokenize(""), []) diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py index 1401190d5..5084f2dfe 100644 --- a/tests/test_ulmfit.py +++ b/tests/test_ulmfit.py @@ -2,10 +2,36 @@ import datetime import os -import sys import unittest -from pythainlp.ulmfit import * +from pythainlp.corpus import get_corpus +from pythainlp.tokenize import Tokenizer +from pythainlp.ulmfit import ( + ThaiTokenizer, + BaseTokenizer, + fix_html, + _THWIKI_LSTM, + pre_rules_th, + post_rules_th, + pre_rules_th_sparse, + post_rules_th_sparse, + rm_useless_spaces, + spec_add_spaces, + rm_useless_newlines, + rm_brackets, + ungroup_emoji, + lowercase_all, + replace_rep_nonum, + replace_rep_after, + replace_wrep_post, + replace_wrep_post_nonum, + remove_space, + process_thai +) + +_THAI2FIT_WORDS = get_corpus("words_th_thai2fit_201810.txt") +_pythainlp_tokenizer = Tokenizer(custom_dict=_THAI2FIT_WORDS, engine="newmm") + class TestUlmfitPackage(unittest.TestCase): @@ -13,9 +39,152 @@ def test_ThaiTokenizer(self): self.thai = ThaiTokenizer() self.assertIsNotNone(self.thai.tokenizer("ทดสอบการตัดคำ")) self.assertIsNone(self.thai.add_special_cases(["แมว"])) + + def test_BaseTokenizer(self): + self.base = BaseTokenizer(lang='th') + self.assertIsNotNone(self.base.tokenizer("ทดสอบ การ ตัด คำ")) + self.assertIsNone(self.base.add_special_cases(["แมว"])) + def test_load_pretrained(self): self.assertIsNotNone(_THWIKI_LSTM) + def test_pre_rules_th(self): self.assertIsNotNone(pre_rules_th) + def test_post_rules_th(self): self.assertIsNotNone(post_rules_th) + + def test_pre_rules_th(self): + self.assertIsNotNone(pre_rules_th_sparse) + + def test_post_rules_th(self): + self.assertIsNotNone(post_rules_th_sparse) + + def test_fix_html(self): + self.assertEqual( + fix_html("Some HTML&nbsp;text<br />"), + "Some HTML& text\n") + + def test_rm_useless_spaces(self): + self.assertEqual( + rm_useless_spaces("Inconsistent use of spaces."), + "Inconsistent use of spaces.") + + def test_spec_add_spaces(self): + self.assertEqual( + spec_add_spaces("I #like to #put #hashtags #everywhere!"), + "I # like to # put # hashtags # everywhere!") + + def test_replace_rep_after(self): + self.assertEqual( + replace_rep_after("น้อยยยยยยยย"), + "น้อยxxrep8 ") + + def test_replace_rep_nonum(self): + self.assertEqual( + replace_rep_nonum("น้อยยยยยยยย"), + "น้อย xxrep ") + + def test_replace_wrep_post(self): + self.assertEqual( + replace_wrep_post(["น้อย", "น้อย"]), + ["xxwrep", "1", "น้อย"]) + + self.assertEqual( + replace_wrep_post(["นก", "กา", "กา", "กา"]), + ["นก", "xxwrep", "2", "กา"]) + + def test_replace_wrep_post_nonum(self): + self.assertEqual( + replace_wrep_post_nonum(["น้อย", "น้อย"]), + ["xxwrep", "น้อย"]) + + self.assertEqual( + replace_wrep_post_nonum(["นก", "กา", "กา", "กา"]), + ["นก", "xxwrep", "กา"]) + + def test_remove_space(self): + self.assertEqual( + remove_space([" ", "น้อย", " ", "."]), + ["น้อย", "."]) + + def test_rm_useless_newlines(self): + self.assertEqual( + rm_useless_newlines("text\n\n"), + "text ") + + def test_rm_brackets(self): + self.assertEqual( + rm_brackets("()()(ข้อความ)"), + "(ข้อความ)") + self.assertEqual( + rm_brackets("[][][ข้อความ]"), + "[ข้อความ]") + self.assertEqual( + rm_brackets("{}{}{ข้อความ}"), + "{ข้อความ}") + + def test_ungroup_emoji(self): + self.assertEqual( + ungroup_emoji("👍👍👍"), + ["👍", "👍", "👍"]) + + def test_lowercase_all(self): + self.assertEqual( + lowercase_all("HeLlO ."), + ['h', 'e', 'l', 'l', 'o', ' ', '.']) + + def test_process_thai_1(self): + """rules for sparse features""" + + text = "👍👍👍 #AnA มากกกก น้อยน้อย ().1146" + + actual = process_thai(text) + + # after pre_rules_th_sparse + # >>> "👍👍👍 # Ana มาก xxrep น้้อยน้อย .1146" + # + # after tokenize with word_tokenize(engine="newmm") + # >>> ["👍👍👍", " ", "#", " 
","Ana", " ", "มาก", "xxrep", + # " ", "น้อย", "น้อย", " ", ".", "1146"] + # + # after post_rules_th + # - remove whitespace token (" ") + # >>> ["xxwrep, "👍", "#", "ana", "มาก", + # "xxrep", " ", "xxwrep", "น้อย", ".", "1146"] + + expect = ["xxwrep", "👍", "#", "ana", "มาก", "xxrep", + " ", "xxwrep", "น้อย", ".", "1146"] + + self.assertEqual(actual, expect) + + def test_process_thai_2(self): + """rules for dense features""" + + text = "👍👍👍 #AnA มากกกก น้อยน้อย ().1146" + + actual = process_thai(text, + pre_rules=pre_rules_th, + post_rules=post_rules_th, + tok_func=_pythainlp_tokenizer.word_tokenize) + + # after pre_rules_th + # >>> "👍👍👍 # Ana มากxxrep4 น้้อยน้อย .1146" + # + # after tokenize with word_tokenize(engine="newmm") + # >>> ["👍👍👍", " ", "#", "Ana", " ", "มาก", "xxrep", "4", + # " ", "น้อย", "น้อย", " ", ".", "1146"] + # after post_rules_th + # -- because it performs `replace_wrep_post` before `ungroup_emoji`, + # 3 repetitive emoji are not marked with special token "xxwrep num" + # + # >>> ["👍", "👍","👍", " ", "#", "ana", " ", "มาก", + # "xxrep", "4", " ", "xxwrep", "1", "น้อย", " ", + # ".", "1146"] + + expect = ["👍", "👍", "👍", " ", "#", " ", + "ana", " ", "มาก", "xxrep", "4", + " ", "xxwrep", "1", "น้อย", " ", + ".", "1146"] + + self.assertEqual(actual, expect)