# LLM(거대 언어 모델)

## NLP(자연어 처리)

In [1]:
#사전 설치 :pip install konlpy
from konlpy.tag import Okt
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

#텍스트 데이터 (입력문장)
sentences=[
    "자연어 처리는 재미있는 분야입니다.",
    "딥러닝은 많은 데이터를 필요로 합니다.",
    "한국어 NLP는 정말 재미있어요!"
];

#토크나이징
okt=Okt();
tokenized_sentences=[okt.morphs(sentence) for sentence in sentences];
print("토크나이징 결과: ", tokenized_sentences);


토크나이징 결과:  [['자연어', '처리', '는', '재미있는', '분야', '입니다', '.'], ['딥', '러닝', '은', '많은', '데이터', '를', '필요', '로', '합니다', '.'], ['한국어', 'NLP', '는', '정말', '재미있어요', '!']]


In [2]:
#인코딩: 단어를 숫자로 변환
tokenizer=Tokenizer();
tokenizer.fit_on_texts(tokenized_sentences);
encoded_sentences=tokenizer.texts_to_sequences(tokenized_sentences);
print("인코딩 결과:", encoded_sentences);

인코딩 결과: [[3, 4, 1, 5, 6, 7, 2], [8, 9, 10, 11, 12, 13, 14, 15, 16, 2], [17, 18, 1, 19, 20, 21]]


In [3]:
#패딩(padding): 길이를 맞추기 위해 0으로 채우기
max_len=10  #최대길이 설정
padded_sentences=pad_sequences(encoded_sentences, maxlen=max_len, padding="post");
print("패딩 결과:", padded_sentences);

패딩 결과: [[ 3  4  1  5  6  7  2  0  0  0]
 [ 8  9 10 11 12 13 14 15 16  2]
 [17 18  1 19 20 21  0  0  0  0]]


In [6]:
#임베딩(Embedding)
vocab_size=len(tokenizer.word_index)+1  #단어 사전 크기
embedding_dim=8     #임베딩 차원 크기

In [7]:
#간단한 임베딩 모델 생성
model=Sequential();
model.add (Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len));
model.compile("rmsprop", "mse");




In [14]:
#패딩된 문장을 임베딩 층에 통과
embeddings=model.predict(padded_sentences);
print("임베딩 결과(첫번째 문장):\n", embeddings[0]);
print("\n",embeddings[1]);
print("\n", embeddings[2]);


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
임베딩 결과(첫번째 문장):
 [[ 0.03785     0.02770555  0.01835119  0.01561788  0.03928561  0.00935874
   0.01045211  0.0094878 ]
 [ 0.03607792 -0.01949482 -0.04625165 -0.04639664  0.00307175 -0.04118716
  -0.006867    0.00686441]
 [ 0.02437948 -0.02414671  0.04831756  0.04589019 -0.01767214 -0.02486076
  -0.02759489 -0.02554921]
 [ 0.02879046  0.01846165  0.03845319 -0.00784381 -0.00483923 -0.02904035
   0.01024349 -0.01850778]
 [ 0.01767942  0.02069113  0.01443192 -0.00825565  0.00944923  0.0015115
   0.0319241   0.04393909]
 [-0.01904782 -0.01613797 -0.00510596  0.00372597 -0.0250793  -0.03565183
   0.01453916  0.03772693]
 [ 0.0078523  -0.04798561 -0.01991482  0.03380628 -0.02201792  0.04673639
   0.0421188   0.02592966]
 [-0.04754175 -0.02237874  0.03572438  0.01527183  0.04651963  0.03915666
   0.03639365  0.032008  ]
 [-0.04754175 -0.02237874  0.03572438  0.01527183  0.04651963  0.03915666
   0.03639365  0.032008  ]
 [-

## 트랜스포머(Transformer)

### Huggin Face를 사용한  BERT 테스트

In [24]:
# 감정분석(zero-shot classification)
### transformers 라이브러리 사전 설치 : pip install transformers
### tf-keras 라이브러리 사전 설치치 : pip install tf-keras
from transformers import pipeline

classifier=pipeline("sentiment-analysis");
classifier("오늘 기분이 좋아요")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Device set to use 0


[{'label': 'POSITIVE', 'score': 0.8848785758018494}]

In [25]:
classifier(
    ["I've been waiting for a HuggingFace course my whole life.", "I hate this so much"]
)

[{'label': 'POSITIVE', 'score': 0.9598047137260437},
 {'label': 'NEGATIVE', 'score': 0.9995144605636597}]

In [26]:
#텍스트 생성(text generation)
generator=pipeline("text-generation");
generator("In this course, we will teach you how to")

No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
Device set to use 0


[{'generated_text': 'In this course, we will teach you how to use the Google Assistant for the job and how to build productivity apps.\n\nHow to Build a Pro-Workout with Google Assistant\n\nWe will have three parts for the job:\n\n'}]

In [19]:
#question-answering
question_answer=pipeline("question-answering");
question_answer(
    question="Where do I work?",
    context="My name is Sylvain and I work at Hugging Face in Brooklyn",
)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.

All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.
Device set to use 0


{'score': 0.6949762105941772, 'start': 33, 'end': 45, 'answer': 'Hugging Face'}

In [20]:
#요약(Summarization)
summarizer=pipeline("summarization");
summarizer(
    """
    America has changed dramatically during recent years. Not only has the number of
    graduates in traditional engineering disciplines such as mechanical, civil,
    electrical, chemical, and aeronautical engineering declined, but in most of
    the premier American universities engineering curricula now concentrate on
    and encourage largely the study of engineering science. As a result, there
    are declining offerings in engineering subjects dealing with infrastructure,
    the environment, and related issues, and greater concentration on high
    technology subjects, largely supporting increasingly complex scientific
    developments. While the latter is important, it should not be at the expense
    of more traditional engineering.

    Rapidly developing economies such as China and India, as well as other
    industrial countries in Europe and Asia, continue to encourage and advance
    the teaching of engineering. Both China and India, respectively, graduate
    six and eight times as many traditional engineers as does the United States.
    Other industrial countries at minimum maintain their output, while America
    suffers an increasingly serious decline in the number of engineering graduates
    and a lack of well-educated engineers.
    """
)

No model was supplied, defaulted to google-t5/t5-small and revision df1b051 (https://huggingface.co/google-t5/t5-small).
Using a pipeline without specifying a model name and revision in production is not recommended.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Device set to use 0


[{'summary_text': 'the number of graduates in traditional engineering disciplines has declined . in most of the premier american universities engineering curricula now concentrate on and encourage largely the study of engineering science . rapidly developing economies such as China and India continue to encourage and advance the teaching of engineering .'}]