In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
## Sequential API: declare the whole layer stack in a single constructor call

model_seq = keras.Sequential([
    keras.Input(shape=(10,)),                      # each sample has 10 features
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),   # single sigmoid output unit
])

model_seq.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                704       
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,817
Trainable params: 2,817
Non-trainable params: 0
_________________________________________________________________


In [3]:
## Functional API: build the graph by calling each layer on the previous tensor

inputs = keras.Input(shape=(10,))
hidden1 = keras.layers.Dense(units=64, activation='relu')(inputs)
dropout = keras.layers.Dropout(rate=0.2)(hidden1)
# NOTE: despite its name, `hidden2` is the model's output (a 10-unit softmax layer).
hidden2 = keras.layers.Dense(units=10, activation='softmax')(dropout)

model_func = keras.Model(inputs=inputs, outputs=hidden2)
model_func.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 10)]              0         
                                                                 
 dense_3 (Dense)             (None, 64)                704       
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 10)                650       
                                                                 
Total params: 1,354
Trainable params: 1,354
Non-trainable params: 0
_________________________________________________________________


In [4]:
## subclassing API

class model_sub(keras.Model):
    """Dense(relu) -> Dropout -> Dense(softmax) network built with the subclassing API.

    Args:
        hidden: number of units in the ReLU hidden layer.
        drop_rate: rate handed to the Dropout layer.
        outputs: number of units in the softmax output layer.
    """

    def __init__(self, hidden, drop_rate, outputs):
        super().__init__()
        # Layers are only created here; call() decides how they are chained.
        self.hidden = keras.layers.Dense(units=hidden, activation='relu')
        self.dropout = keras.layers.Dropout(rate=drop_rate)
        # NOTE(review): `outputs` is also an attribute name Keras uses on functional
        # models - confirm that reusing it on a subclassed model is harmless.
        self.outputs = keras.layers.Dense(units=outputs, activation='softmax')

    def call(self, inputs):
        """Forward pass: hidden layer, then dropout, then the softmax layer."""
        return self.outputs(self.dropout(self.hidden(inputs)))
    
# 64 hidden units, dropout rate 0.2, 10 output classes.
model = model_sub(hidden=64, drop_rate=0.2, outputs=10)

In [5]:
# A subclassed model has no weights until it knows its input shape, so build it
# explicitly before asking for a summary. The shape used here has 100 features
# (the first Dense layer therefore reports 100*64 + 64 = 6464 parameters), unlike
# the 10-feature inputs of the Sequential/functional examples.
model.build(input_shape=(1,100))

model.summary()

Model: "model_sub"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             multiple                  6464      
                                                                 
 dropout_2 (Dropout)         multiple                  0         
                                                                 
 dense_6 (Dense)             multiple                  650       
                                                                 
Total params: 7,114
Trainable params: 7,114
Non-trainable params: 0
_________________________________________________________________


In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [7]:
# Toy Korean corpus: four short whitespace-separated sentences.
text_data = [
    '나는 배가 고프다',
    '내일 점심 뭐먹지',
    '내일 공부 해야겠다',
    '점심 먹고 공부 해야지',
]

In [11]:
# Bag-of-words counts: fit_transform learns the vocabulary and encodes the corpus in one call.
countVectorizer = CountVectorizer()
Count_vector = countVectorizer.fit_transform(text_data)
DTM = Count_vector.toarray()  # dense document-term matrix, one row per sentence
print(DTM)

print('=' * 50)

# Same corpus, weighted by TF-IDF instead of raw counts.
tfidf_vectorizer = TfidfVectorizer()
Tfidf_vector = tfidf_vectorizer.fit_transform(text_data).toarray()
print(Tfidf_vector)

[[1 0 1 0 0 0 1 0 0 0]
 [0 0 0 1 0 1 0 1 0 0]
 [0 1 0 1 0 0 0 0 1 0]
 [0 1 0 0 1 0 0 1 0 1]]
[[0.57735027 0.         0.57735027 0.         0.         0.
  0.57735027 0.         0.         0.        ]
 [0.         0.         0.         0.52640543 0.         0.66767854
  0.         0.52640543 0.         0.        ]
 [0.         0.52640543 0.         0.52640543 0.         0.
  0.         0.         0.66767854 0.        ]
 [0.         0.43779123 0.         0.         0.55528266 0.
  0.         0.43779123 0.         0.55528266]]


In [9]:
# Token -> column-index mapping learned by the fitted CountVectorizer; the indices
# identify the columns of the document-term matrix.
countVectorizer.vocabulary_

{'나는': 2,
 '배가': 6,
 '고프다': 0,
 '내일': 3,
 '점심': 7,
 '뭐먹지': 5,
 '공부': 1,
 '해야겠다': 8,
 '먹고': 4,
 '해야지': 9}

In [13]:
# pip install nltk
import nltk

In [14]:
# Download NLTK data into the local nltk_data directory.
# NOTE(review): 'all-corpora' fetches every corpus package (abc, alpino, brown, ...),
# which is slow and large; the tokenizer cells visible in this notebook presumably
# only need 'punkt' - confirm before keeping the full download.
nltk.download('all-corpora')
nltk.download('punkt')

[nltk_data] Downloading collection 'all-corpora'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\hyun9\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\hyun9\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package bcp47 to
[nltk_data]    |     C:\Users\hyun9\AppData\Roaming\nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     C:\Users\hyun9\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\hyun9\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\brown.zip.
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     C:\Users\hyun9\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\brown_tei.zip.
[

True

In [15]:
# Sample paragraph for the tokenizer demos, assembled from adjacent string literals
# (the trailing space inside each piece keeps the words separated).
sentence = (
    "Natural language processing (NLP) is a subfield of computer science, "
    "information engineering, and artificial intelligence concerned "
    "with the interactions between computers and human (natural) languages, "
    "in particular how to program computers to process and analyze "
    "large amounts of natural language data."
)

print(sentence)

Natural language processing (NLP) is a subfield of computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.


In [16]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [17]:
# Word-level tokenization with NLTK: punctuation such as '(', ')', ',' and '.'
# comes out as separate tokens.
nltk_word = word_tokenize(sentence)

display(nltk_word)

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'subfield',
 'of',
 'computer',
 'science',
 ',',
 'information',
 'engineering',
 ',',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 '(',
 'natural',
 ')',
 'languages',
 ',',
 'in',
 'particular',
 'how',
 'to',
 'program',
 'computers',
 'to',
 'process',
 'and',
 'analyze',
 'large',
 'amounts',
 'of',
 'natural',
 'language',
 'data',
 '.']

In [18]:
# For comparison: plain whitespace splitting keeps punctuation glued to the words
# (e.g. '(NLP)', 'science,', 'data.').
display(sentence.split())

['Natural',
 'language',
 'processing',
 '(NLP)',
 'is',
 'a',
 'subfield',
 'of',
 'computer',
 'science,',
 'information',
 'engineering,',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 '(natural)',
 'languages,',
 'in',
 'particular',
 'how',
 'to',
 'program',
 'computers',
 'to',
 'process',
 'and',
 'analyze',
 'large',
 'amounts',
 'of',
 'natural',
 'language',
 'data.']

In [20]:
# Sentence-level tokenization: the sample text is a single sentence, so one item is returned.
nltk_sent = sent_tokenize(sentence)

print(nltk_sent, len(nltk_sent), sep='\n')

['Natural language processing (NLP) is a subfield of computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.']
1


In [1]:
# Shell out to check which Java runtime is on PATH.
# NOTE(review): presumably a prerequisite check for the JPype1/konlpy installs that follow.
! java -version

java version "1.8.0_381"
Java(TM) SE Runtime Environment (build 1.8.0_381-b09)
Java HotSpot(TM) 64-Bit Server VM (build 25.381-b09, mixed mode)


In [1]:
import sys
# Show the interpreter version string (the recorded run was CPython 3.8, 64-bit Windows),
# which must match the cp38 / win_amd64 tags of the JPype1 wheel installed below.
sys.version

'3.8.16 (default, Mar  2 2023, 03:18:16) [MSC v.1916 64 bit (AMD64)]'

In [4]:
# pip install ./JPype1-1.4.0-cp38-cp38-win_amd64.whl

Processing .\jpype1-1.4.0-cp38-cp38-win_amd64.whl
Installing collected packages: JPype1
Successfully installed JPype1-1.4.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
# pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
     --------------------------------------- 19.4/19.4 MB 43.7 MB/s eta 0:00:00
Collecting lxml>=4.1.0
  Downloading lxml-4.9.3-cp38-cp38-win_amd64.whl (3.9 MB)
     ---------------------------------------- 3.9/3.9 MB 82.8 MB/s eta 0:00:00
Installing collected packages: lxml, konlpy
Successfully installed konlpy-0.6.0 lxml-4.9.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
from konlpy.tag import Okt

In [3]:
# Instantiate the Okt tagger imported from konlpy.tag; it is reused by the cells below.
okt = Okt()

In [7]:
# Sample Korean sentence used to compare whitespace splitting with Okt's analyses.
text = "한글 자연어 처리는 재밌다 이제부터 열심히 해야 ㅎㅎㅎ"

print(text.split())                  # naive whitespace tokens (particles stay attached)
print(okt.nouns(text))               # nouns only
print(okt.morphs(text))              # morpheme segmentation
print(okt.morphs(text, stem= True))  # same, with stemming (e.g. '해야' -> '하다')
print(okt.phrases(text))             # extracted phrases
print(okt.pos(text))                 # (token, part-of-speech tag) pairs

['한글', '자연어', '처리는', '재밌다', '이제부터', '열심히', '해야', 'ㅎㅎㅎ']
['한글', '자연어', '처리', '이제']
['한글', '자연어', '처리', '는', '재밌다', '이제', '부터', '열심히', '해야', 'ㅎㅎㅎ']
['한글', '자연어', '처리', '는', '재밌다', '이제', '부터', '열심히', '하다', 'ㅎㅎㅎ']
['한글', '한글 자연어', '한글 자연어 처리', '이제', '자연어', '처리']
[('한글', 'Noun'), ('자연어', 'Noun'), ('처리', 'Noun'), ('는', 'Josa'), ('재밌다', 'Adjective'), ('이제', 'Noun'), ('부터', 'Josa'), ('열심히', 'Adverb'), ('해야', 'Verb'), ('ㅎㅎㅎ', 'KoreanParticle')]


In [8]:
from konlpy.corpus import kolaw, kobill

In [10]:
# Read two sample documents bundled with KoNLPy. Each corpus file is opened in a
# `with` block so its handle is closed as soon as the text has been read, and the
# text is bound to a name instead of being computed and thrown away.
with kolaw.open('constitution.txt') as constitution_file:
    constitution_text = constitution_file.read()

with kobill.open('1809890.txt') as bill_file:
    bill_text = bill_file.read()

# Only the last expression of a cell is displayed: show the bill text, as before.
bill_text

'지방공무원법 일부개정법률안\n\n(정의화의원 대표발의 )\n\n 의 안\n 번 호\n\n9890\n\n발의연월일 : 2010.  11.  12.  \n\n발  의  자 : 정의화․이명수․김을동 \n\n이사철․여상규․안규백\n\n황영철․박영아․김정훈\n\n김학송 의원(10인)\n\n제안이유 및 주요내용\n\n  초등학교 저학년의 경우에도 부모의 따뜻한 사랑과 보살핌이 필요\n\n한 나이이나, 현재 공무원이 자녀를 양육하기 위하여 육아휴직을 할 \n\n수 있는 자녀의 나이는 만 6세 이하로 되어 있어 초등학교 저학년인 \n\n자녀를 돌보기 위해서는 해당 부모님은 일자리를 그만 두어야 하고 \n\n이는 곧 출산의욕을 저하시키는 문제로 이어질 수 있을 것임.\n\n  따라서 육아휴직이 가능한 자녀의 연령을 만 8세 이하로 개정하려\n\n는 것임(안 제63조제2항제4호).\n\n- 1 -\n\n\x0c법률  제        호\n\n지방공무원법 일부개정법률안\n\n지방공무원법 일부를 다음과 같이 개정한다.\n\n제63조제2항제4호 중 “만 6세 이하의 초등학교 취학 전 자녀를”을 “만 \n\n8세 이하(취학 중인 경우에는 초등학교 2학년 이하를 말한다)의 자녀를”\n\n로 한다.\n\n부      칙\n\n이 법은 공포한 날부터 시행한다.\n\n- 3 -\n\n\x0c신 ·구조문대비표\n\n현      행\n\n개   정   안\n\n제63조(휴직) ① (생  략)\n\n제63조(휴직) ① (현행과 같음)\n\n  ② 공무원이 다음 각 호의 어\n\n  ② -------------------------\n\n느 하나에 해당하는 사유로 휴\n\n----------------------------\n\n직을 원하면 임용권자는 휴직\n\n----------------------------\n\n을 명할 수 있다. 다만, 제4호\n\n-------------.---------------\n\n의 경우에는 대통령령으로 정\n\n---------------------------

In [13]:
## Python string methods

# Sample string with one leading and two trailing spaces (used by the cells below).
a = ' Natural language  '
print(a)  # print() writes the raw text
a         # the bare expression shows the repr, making the surrounding spaces visible

 Natural language  


' Natural language  '

In [17]:
# Only the last expression of a cell is displayed; the first three results are
# computed and discarded.
a.count('a')   # number of occurrences of 'a' (4)
a.find('a')    # index of the first 'a' (2)
a.find('al')   # index where the substring 'al' starts (6)
a.find('w')    # -1 because 'w' does not occur

-1

In [20]:
# str.join puts the separator between the items of any iterable of strings.
print(' '.join(a))                 # a string is iterated character by character
print(','.join('kkkk'))            # -> k,k,k,k
print(','.join(['kkk','llll']))    # list items are joined whole -> kkk,llll

  N a t u r a l   l a n g u a g e    
k,k,k,k
kkk,llll


In [22]:
# Both calls return new strings and leave `a` unchanged; only the last
# expression (lower()) is displayed by the notebook.
a.upper()
a.lower()

' natural language  '

In [25]:
# strip() trims both ends, lstrip() the left, rstrip() the right. Each returns a
# new string; only the last expression (rstrip()) is displayed.
a.strip()
a.lstrip()
a.rstrip()

' Natural language'

In [27]:
print(a)

# Returns a new string with every 'a' replaced by 'ko'; the result is discarded
# here (not the last expression, not printed) and `a` itself is unchanged.
a.replace('a','ko')

# split() with no argument splits on runs of whitespace and drops empty pieces.
a.split()

 Natural language  


['Natural', 'language']

In [29]:

# str.format fills the '{}' placeholder with its argument.
# NOTE(review): 'dat' in the message looks like a typo for 'day'; left as-is because
# it is runtime text and the recorded output below matches it.
print('I eat {} apples a dat'.format(3))

I eat 3 apples a dat


In [30]:
import re

In [34]:
## Metacharacter '.': matches any single character (except a newline by default)
for pattern, subject in (('ab', 'aababc'), ('a.b', 'aababc'), ('a.b', 'axbabc')):
    print(re.search(pattern, subject))

<re.Match object; span=(1, 3), match='ab'>
<re.Match object; span=(0, 3), match='aab'>
<re.Match object; span=(0, 3), match='axb'>


In [37]:
# search() reports only the first (leftmost) match: 'axb', not the later 'awb'.
ind = re.search('a.b', 'axbawbc')

# The match object, then its start and end offsets, one per line.
print(ind, ind.start(), ind.end(), sep='\n')

<re.Match object; span=(0, 3), match='axb'>
0
3


In [39]:
# '[.]': inside a character class the dot is a literal '.', not a wildcard
for subject in ('aabccab', 'aabcca.b'):
    print(re.search('a[.]b', subject))


None
<re.Match object; span=(5, 8), match='a.b'>


In [42]:
# '*': zero or more repetitions of the preceding 'a'
for subject in ('aababc', 'aaaaaaababc', 'babc'):
    print(re.search('a*b', subject))

<re.Match object; span=(0, 3), match='aab'>
<re.Match object; span=(0, 8), match='aaaaaaab'>
<re.Match object; span=(0, 1), match='b'>


In [45]:
# '+': one or more repetitions of the preceding 'a'
for subject in ('aababc', 'aaaaababc', 'babc'):
    print(re.search('a+b', subject))

<re.Match object; span=(0, 3), match='aab'>
<re.Match object; span=(0, 6), match='aaaaab'>
<re.Match object; span=(1, 3), match='ab'>


In [46]:
# '?': the preceding 'a' is optional, i.e. it may occur zero times or once

for subject in ('aababc', 'aaaaababc', 'babc'):
    print(re.search('a?b', subject))

<re.Match object; span=(1, 3), match='ab'>
<re.Match object; span=(4, 6), match='ab'>
<re.Match object; span=(0, 1), match='b'>


In [52]:
# Counted repetition: a{2} means exactly two 'a's, a{3,5} means three to five 'a's

for pattern, subject in (
    ('a{2}b', 'aababc'),
    ('a{2}b', 'abaabc'),
    ('a{2}b', 'ababc'),
    ('a{3,5}b', 'aababc'),
    ('a{3,5}b', 'aaaababc'),
):
    print(re.search(pattern, subject))

<re.Match object; span=(0, 3), match='aab'>
<re.Match object; span=(2, 5), match='aab'>
None
None
<re.Match object; span=(0, 5), match='aaaab'>


In [58]:
# Character class [a-z]: one or more lowercase ASCII letters

p = re.compile('[a-z]+')

# match() only succeeds at position 0; search() scans forward to the first hit.
print(p.match(' python'))
print(p.search(' python'))

# Start offset, end offset and matched text of a successful match().
hit = p.match('python')
print(hit.start(), hit.end(), hit.group(), sep='\n')

None
<re.Match object; span=(1, 7), match='python'>
0
6
python


In [62]:
# 's', an optional 'k', then 't' - tried only at the start of the string.
p = re.compile('sk?t')
m = p.match('string goes here')

# match() returns None when the pattern does not match at position 0.
if m is None:
    print('No matched here')
else:
    print('Matched here', m.group())

Matched here st


In [60]:
# The text starts with a digit, so match() fails while search() still finds 'python'.
p = re.compile('[a-z]+')
for finder in (p.match, p.search):
    print(finder('4 python'))

None
<re.Match object; span=(2, 8), match='python'>


In [67]:
## findall: every non-overlapping match, returned as a list of strings
p = re.compile('[a-zA-Z]+')
phrase = 'life is too short'
print(p.findall(phrase))
print(phrase.split())  # same result here because the text is only letters and spaces

['life', 'is', 'too', 'short']
['life', 'is', 'too', 'short']


In [68]:
# finditer yields match objects (with span information) lazily, instead of the
# plain strings that findall returns. Relies on `p` compiled in the previous cell.
result = p.finditer('life is too short')

# Iterating consumes the iterator; each item is an re.Match.
for i in result:
    print(i)

<re.Match object; span=(0, 4), match='life'>
<re.Match object; span=(5, 7), match='is'>
<re.Match object; span=(8, 11), match='too'>
<re.Match object; span=(12, 17), match='short'>


In [71]:
# re.IGNORECASE (short alias: re.I) lets the lowercase class [a-z] match uppercase
# letters too; without the flag the search on 'Python' would skip the capital 'P'.
p = re.compile('[a-z]+', re.IGNORECASE)

for subject in ('python', 'Python'):
    print(p.search(subject))

<re.Match object; span=(0, 6), match='python'>
<re.Match object; span=(0, 6), match='Python'>


In [74]:
## re.MULTILINE: '^' anchors at the start of every line, not only at the start of the text
# The pattern is a raw string so that '\s' and '\w' reach the regex engine untouched;
# in a normal string literal they are invalid escape sequences that Python warns about.
p = re.compile(r'^python\s\w+', re.MULTILINE)

data = '''python one
life is too short
python two
you need python
python three
'''

# Lines starting with 'python' followed by whitespace and a word; 'you need python' is skipped.
p.findall(data)

['python one', 'python two', 'python three']

In [78]:
# '|' is alternation: the pattern matches either 'python' or 'Hello' (case-sensitive).
p = re.compile('python|Hello')

for finder, subject in (
    (p.match, 'python and hello'),
    (p.search, 'python and Hello'),
    (p.search, 'Hello and python'),
):
    print(finder(subject))

<re.Match object; span=(0, 6), match='python'>
<re.Match object; span=(0, 6), match='python'>
<re.Match object; span=(0, 5), match='Hello'>


In [81]:
# Swapping the alternatives does not change the result: search() returns the
# leftmost match in the text, which is still 'python'.
p = re.compile('Hello|python')

leftmost = p.search('python and Hello')
print(leftmost)

<re.Match object; span=(0, 6), match='python'>


In [82]:
## '^' anchors a match to the start of the string ('$', shown next, anchors the end)
for pattern in ('Life', '^Life'):
    for subject in ('Life is too short', 'Short is my Life'):
        print(re.search(pattern, subject))

<re.Match object; span=(0, 4), match='Life'>
<re.Match object; span=(12, 16), match='Life'>
<re.Match object; span=(0, 4), match='Life'>
None


In [83]:
# '$' anchors a match to the end of the string: 'Life$' only matches when the text ends with 'Life'.
for pattern in ('Life', 'Life$'):
    for subject in ('Life is too short', 'Short is my Life'):
        print(re.search(pattern, subject))

<re.Match object; span=(0, 4), match='Life'>
<re.Match object; span=(12, 16), match='Life'>
None
<re.Match object; span=(12, 16), match='Life'>


In [87]:
# Parentheses create a capturing group. With no quantifier after the group, the
# pattern matches a single 'ABC', so search() returns span (0, 3).
# NOTE(review): if the intent was to match the repeated run 'ABCABCABC', the
# pattern would need a quantifier such as '(ABC)+' - confirm.
p = re.compile('(ABC)')

p.search('ABCABCABC OK?')

<re.Match object; span=(0, 3), match='ABC'>

In [131]:
# Sample "name phone-number" records, with and without hyphens.
text1 = 'park 010-1234-1234'
text2 = 'kim 010-7984-7984'
text3 = 'lee 01045697816'

# Three runs of digits separated by optional hyphens. Written as a raw string so
# that '\d' is passed to the regex engine as-is; in a normal string literal it is
# an invalid escape sequence that Python warns about.
p = re.compile(r'\d+[-]?\d+[-]?\d+')

print(p.search(text3))
print(p.search(text2))

<re.Match object; span=(4, 15), match='01045697816'>
<re.Match object; span=(4, 17), match='010-7984-7984'>


In [132]:
import kaggle

In [133]:
# Shell out to the Kaggle CLI to list competitions.
# NOTE(review): presumably requires Kaggle API credentials configured on this machine - confirm.
! kaggle competitions list

ref                                                                                           deadline             category            reward  teamCount  userHasEntered  
--------------------------------------------------------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
https://www.kaggle.com/competitions/asl-fingerspelling                                        2023-08-24 23:59:00  Research          $200,000        805           False  
https://www.kaggle.com/competitions/icr-identify-age-related-conditions                       2023-08-10 23:59:00  Featured           $60,000       5870           False  
https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries                      2023-10-11 23:59:00  Featured           $60,000        284           False  
https://www.kaggle.com/competitions/bengaliai-speech                                          2023-10-17 23:59:00  Research           $53,000    

In [135]:
# Download a user-uploaded dataset (as a zip) into the current working directory.
!kaggle datasets download -d ymanojkumar023/kumarmanoj-bag-of-words-meets-bags-of-popcorn

Downloading kumarmanoj-bag-of-words-meets-bags-of-popcorn.zip to c:\tensorflow_sample\class_sample




  0%|          | 0.00/52.4M [00:00<?, ?B/s]
  2%|▏         | 1.00M/52.4M [00:00<00:41, 1.31MB/s]
  4%|▍         | 2.00M/52.4M [00:00<00:20, 2.64MB/s]
  8%|▊         | 4.00M/52.4M [00:01<00:08, 5.69MB/s]
 15%|█▌        | 8.00M/52.4M [00:01<00:03, 12.2MB/s]
 19%|█▉        | 10.0M/52.4M [00:01<00:03, 12.4MB/s]
 25%|██▍       | 13.0M/52.4M [00:01<00:02, 16.2MB/s]
 32%|███▏      | 17.0M/52.4M [00:01<00:01, 20.9MB/s]
 38%|███▊      | 20.0M/52.4M [00:01<00:01, 23.0MB/s]
 46%|████▌     | 24.0M/52.4M [00:01<00:01, 26.4MB/s]
 53%|█████▎    | 28.0M/52.4M [00:01<00:00, 28.7MB/s]
 59%|█████▉    | 31.0M/52.4M [00:02<00:00, 29.1MB/s]
 67%|██████▋   | 35.0M/52.4M [00:02<00:00, 30.9MB/s]
 74%|███████▍  | 39.0M/52.4M [00:02<00:00, 31.2MB/s]
 82%|████████▏ | 43.0M/52.4M [00:02<00:00, 32.4MB/s]
 90%|████████▉ | 47.0M/52.4M [00:02<00:00, 32.0MB/s]
 97%|█████████▋| 51.0M/52.4M [00:02<00:00, 32.8MB/s]
100%|██████████| 52.4M/52.4M [00:02<00:00, 20.6MB/s]


In [136]:
# Download the word2vec-nlp-tutorial competition data (as a zip) into the current working directory.
# NOTE(review): this looks like the same "bag of words meets bags of popcorn" data as the
# dataset downloaded in the previous cell - confirm whether both downloads are needed.
!kaggle competitions download -c word2vec-nlp-tutorial

Downloading word2vec-nlp-tutorial.zip to c:\tensorflow_sample\class_sample




  0%|          | 0.00/51.7M [00:00<?, ?B/s]
  2%|▏         | 1.00M/51.7M [00:00<00:39, 1.35MB/s]
  4%|▍         | 2.00M/51.7M [00:00<00:19, 2.70MB/s]
  8%|▊         | 4.00M/51.7M [00:01<00:08, 5.78MB/s]
 15%|█▌        | 8.00M/51.7M [00:01<00:03, 12.3MB/s]
 19%|█▉        | 10.0M/51.7M [00:01<00:03, 12.5MB/s]
 25%|██▌       | 13.0M/51.7M [00:01<00:02, 16.1MB/s]
 33%|███▎      | 17.0M/51.7M [00:01<00:01, 20.8MB/s]
 41%|████      | 21.0M/51.7M [00:01<00:01, 24.6MB/s]
 48%|████▊     | 25.0M/51.7M [00:01<00:01, 27.4MB/s]
 54%|█████▍    | 28.0M/51.7M [00:01<00:00, 28.0MB/s]
 62%|██████▏   | 32.0M/51.7M [00:02<00:00, 29.9MB/s]
 70%|██████▉   | 36.0M/51.7M [00:02<00:00, 31.1MB/s]
 77%|███████▋  | 40.0M/51.7M [00:02<00:00, 32.3MB/s]
 85%|████████▌ | 44.0M/51.7M [00:02<00:00, 32.6MB/s]
 93%|█████████▎| 48.0M/51.7M [00:02<00:00, 32.9MB/s]
100%|██████████| 51.7M/51.7M [00:02<00:00, 32.4MB/s]
100%|██████████| 51.7M/51.7M [00:02<00:00, 20.7MB/s]
