# 리뷰 긍정 부정 판단 머신러닝

### 0. 준비
- 가상환경 셋팅
- 필요 라이브러리 설치

In [2]:
# Load the IMDB review dataset (review text + sentiment label) from GitHub.
import pandas as pd

IMDB_CSV_URL = 'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Dataset.csv'

data = pd.read_csv(IMDB_CSV_URL)
# Preview the first ten rows.
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [3]:
# Count the reviews per sentiment class to check that the labels are balanced.
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [4]:
from datasets import Dataset

# Convert the pandas DataFrame into a Hugging Face Dataset and hold out 30% of
# the rows for evaluation. A fixed seed keeps the split identical across runs,
# so accuracy numbers stay comparable between experiments.
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.3, seed=42)
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

In [5]:
# Map the string sentiment to the integer class id used for training.
label2id = {'negative': 0, 'positive': 1}


def _add_label(example):
    """Return the `label` column value for a single example."""
    return {'label': label2id[example['sentiment']]}


dataset = dataset.map(_add_label)
dataset

Map: 100%|██████████| 35000/35000 [00:00<00:00, 46115.19 examples/s]
Map: 100%|██████████| 15000/15000 [00:00<00:00, 49694.25 examples/s]


DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment', 'label'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment', 'label'],
        num_rows: 15000
    })
})

In [6]:
# Peek at the first five training examples (returned as a dict of column lists).
dataset['train'][0:5]

{'review': ["This show is terrible. I cannot get over the complete waste of great talent this show contains. This is not entertaining improvisational acting, it's just a cheap attempt to throw someone famous comedic actors onto a stage and have them perform a poorly improved scene. I have actually done improv work as an actor, and this show is not improv.<br /><br />What the audience is actually laughing at (if they're actually laughing at this show at all, it looks quite fake) is the embarrassment of the guest star being lost like a deer in headlights. The dumb, completely unrelated things they come up with are what people laugh at. And if it's not part of the scene, the actors will tell them that it's wrong! I find this show is disgrace to the art, and makes me cry for shows like Whose Line is it Anyway, which had great talent, great improv games, and on top of everything else, didn't make me want to change the channel.",
  "What a horrible movie. This movie was so out of order and s

In [7]:
from transformers import AutoTokenizer
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Checkpoint name of the pretrained TinyBERT model on the Hugging Face Hub.
model = 'huawei-noah/TinyBERT_General_4L_312D'

# Load the fast tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
tokenizer



BertTokenizerFast(name_or_path='huawei-noah/TinyBERT_General_4L_312D', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [8]:
# Compare a raw review with its tokenized form, then sanity-check the
# tokenizer on a short sentence.
first_review = dataset['train'][0]['review']
encoded_review = tokenizer(first_review)
encoded_sample = tokenizer('Today is monday')

print(f"데이터셋의 데이터 : {first_review}")
print(f"토크나이저로 변환된 데이터셋 : {encoded_review}")
print(f"토크나이저 기능 확인 : {encoded_sample}")

데이터셋의 데이터 : This show is terrible. I cannot get over the complete waste of great talent this show contains. This is not entertaining improvisational acting, it's just a cheap attempt to throw someone famous comedic actors onto a stage and have them perform a poorly improved scene. I have actually done improv work as an actor, and this show is not improv.<br /><br />What the audience is actually laughing at (if they're actually laughing at this show at all, it looks quite fake) is the embarrassment of the guest star being lost like a deer in headlights. The dumb, completely unrelated things they come up with are what people laugh at. And if it's not part of the scene, the actors will tell them that it's wrong! I find this show is disgrace to the art, and makes me cry for shows like Whose Line is it Anyway, which had great talent, great improv games, and on top of everything else, didn't make me want to change the channel.
토크나이저로 변환된 데이터셋 : {'input_ids': [101, 2023, 2265, 2003, 6659, 101

In [9]:
def tokenize(batch):
    """Tokenize the `review` column of a batch.

    Reviews are truncated to at most 300 tokens and padded to the longest
    sequence in the batch.

    Args:
        batch: Mapping with a `review` entry holding the text(s) to encode.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        batch['review'],
        max_length=300,
        truncation=True,
        padding=True,
    )

# Tokenize both splits. batched=True passes many rows per call; with
# batch_size=None each split is handed to `tokenize` as a single batch.
dataset = dataset.map(tokenize, batched=True, batch_size=None)
dataset

Map: 100%|██████████| 35000/35000 [00:05<00:00, 6193.96 examples/s]
Map: 100%|██████████| 15000/15000 [00:02<00:00, 6379.89 examples/s]


DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15000
    })
})

In [10]:
# Inspect one fully processed training example (text, label and token ids).
dataset['train'][0]

{'review': "This show is terrible. I cannot get over the complete waste of great talent this show contains. This is not entertaining improvisational acting, it's just a cheap attempt to throw someone famous comedic actors onto a stage and have them perform a poorly improved scene. I have actually done improv work as an actor, and this show is not improv.<br /><br />What the audience is actually laughing at (if they're actually laughing at this show at all, it looks quite fake) is the embarrassment of the guest star being lost like a deer in headlights. The dumb, completely unrelated things they come up with are what people laugh at. And if it's not part of the scene, the actors will tell them that it's wrong! I find this show is disgrace to the art, and makes me cry for shows like Whose Line is it Anyway, which had great talent, great improv games, and on top of everything else, didn't make me want to change the channel.",
 'sentiment': 'negative',
 'label': 0,
 'input_ids': [101,
  20

In [11]:
import evaluate
import numpy as np

# Load the accuracy metric from the `evaluate` library.
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair of (model outputs, reference labels). The outputs are
            reduced with argmax along axis 1 to get the predicted class ids.

    Returns:
        The result of `accuracy.compute` for predicted vs. reference labels.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [12]:
from transformers import AutoModelForSequenceClassification

id2label = {0: 'negative', 1: 'positive'}

# Load TinyBERT with a 2-class sequence-classification head.
# The checkpoint name is read from the tokenizer rather than from `model`:
# this cell rebinds `model` from the checkpoint string to the loaded network,
# so passing `model` itself would fail as soon as the cell is run a second time.
model = AutoModelForSequenceClassification.from_pretrained(
    tokenizer.name_or_path,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from transformers import TrainingArguments, Trainer

# Hyperparameters and settings for training.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir='train_dir',               # directory where training outputs are written
    overwrite_output_dir=True,            # overwrite files already present in the output directory
    num_train_epochs=3,                   # number of training epochs
    learning_rate=2e-5,                   # learning rate
    per_device_train_batch_size=32,       # training batch size per device (e.g. per GPU)
    per_device_eval_batch_size=32,        # evaluation batch size per device
    evaluation_strategy='epoch'           # evaluate at the end of every epoch
)

# Trainer object that manages training and evaluation.
trainer = Trainer(
    model=model,                          # model to train
    args=args,                            # training configuration
    train_dataset=dataset['train'],       # dataset used for training
    eval_dataset=dataset['test'],         # dataset used for evaluation
    compute_metrics=compute_metrics,      # function that computes the evaluation metrics
    tokenizer=tokenizer                   # tokenizer (converts text to tokens)
)



In [14]:
# Fine-tune the model; loss and per-epoch evaluation metrics are logged below.
trainer.train()

 15%|█▌        | 500/3282 [02:23<13:07,  3.53it/s]

{'loss': 0.4592, 'grad_norm': 8.490633964538574, 'learning_rate': 1.695307739183425e-05, 'epoch': 0.46}


 30%|███       | 1000/3282 [04:45<10:53,  3.49it/s]

{'loss': 0.3554, 'grad_norm': 5.520948886871338, 'learning_rate': 1.3906154783668494e-05, 'epoch': 0.91}


                                                   
 33%|███▎      | 1094/3282 [05:55<10:10,  3.58it/s]

{'eval_loss': 0.3265502154827118, 'eval_accuracy': 0.8609333333333333, 'eval_runtime': 42.7699, 'eval_samples_per_second': 350.714, 'eval_steps_per_second': 10.966, 'epoch': 1.0}


 46%|████▌     | 1500/3282 [07:50<08:22,  3.54it/s]  

{'loss': 0.302, 'grad_norm': 15.028241157531738, 'learning_rate': 1.0859232175502743e-05, 'epoch': 1.37}


 61%|██████    | 2000/3282 [10:11<06:01,  3.55it/s]

{'loss': 0.289, 'grad_norm': 21.859106063842773, 'learning_rate': 7.81230956733699e-06, 'epoch': 1.83}


                                                   
 67%|██████▋   | 2188/3282 [11:47<04:44,  3.84it/s]

{'eval_loss': 0.294048547744751, 'eval_accuracy': 0.8768666666666667, 'eval_runtime': 42.5027, 'eval_samples_per_second': 352.919, 'eval_steps_per_second': 11.035, 'epoch': 2.0}


 76%|███████▌  | 2500/3282 [13:16<03:42,  3.51it/s]  

{'loss': 0.2615, 'grad_norm': 7.353476047515869, 'learning_rate': 4.765386959171238e-06, 'epoch': 2.29}


 91%|█████████▏| 3000/3282 [15:37<01:19,  3.53it/s]

{'loss': 0.2597, 'grad_norm': 9.855073928833008, 'learning_rate': 1.7184643510054846e-06, 'epoch': 2.74}


                                                   
100%|██████████| 3282/3282 [17:40<00:00,  3.10it/s]

{'eval_loss': 0.2930574119091034, 'eval_accuracy': 0.8802, 'eval_runtime': 42.8248, 'eval_samples_per_second': 350.264, 'eval_steps_per_second': 10.952, 'epoch': 3.0}
{'train_runtime': 1060.0341, 'train_samples_per_second': 99.053, 'train_steps_per_second': 3.096, 'train_loss': 0.3149757664208002, 'epoch': 3.0}





TrainOutput(global_step=3282, training_loss=0.3149757664208002, metrics={'train_runtime': 1060.0341, 'train_samples_per_second': 99.053, 'train_steps_per_second': 3.096, 'total_flos': 882184338000000.0, 'train_loss': 0.3149757664208002, 'epoch': 3.0})

In [None]:
# CPU vs GPU의 차이

# 간단한 덧셈 문제 100개 문제를 누가 더 빨리 풀까요?
# - 대학생 1명(CPU) vs 초딩 100명(GPU)
# - NVIDIA가 왜 미친듯이 올랐죠? => GPU만들 잖아요. 

# GPU랑 딥러닝(단순 행렬 계산)은 뭔 상관이지?
# - 간단한 덧셈 문제 푸는데 대학생 1명 (시급 100만원) -> 초딩100명(시급1만원)

# 다굴엔 장사없다. => 포폴이 졸라 많으면 되요. 포폴 1개 인것 보다 2개 => 3개 => 4개 // 앱 10개(계산기...) => 코테 안봅니다.
# 회사에서 메일이 올 때 코테(필터) 없이 바로 면접을 보자고 해요.
# 코테 30분 미만으로 머리 식히는 용으로 => 나머지 진짜 공부를 하세요.
# DE 

In [None]:
# requirements.txt => pip install # 11시
# pip install -r requirements.txt

In [15]:
# Evaluate the fine-tuned model on the held-out test split.
trainer.evaluate()

100%|██████████| 469/469 [00:43<00:00, 10.85it/s]


{'eval_loss': 0.2930574119091034,
 'eval_accuracy': 0.8802,
 'eval_runtime': 43.4409,
 'eval_samples_per_second': 345.296,
 'eval_steps_per_second': 10.796,
 'epoch': 3.0}

In [16]:
# Save the fine-tuned model to a local folder.
trainer.save_model('tinybert-sentiment-analysis')

In [17]:
from transformers import pipeline

# Run inference on the GPU when available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build a text-classification pipeline from the locally saved model folder.
classifier = pipeline(
    'text-classification',
    model='tinybert-sentiment-analysis',
    device=device,
)

In [29]:
# Model usage example 1: classify a few short texts.
# A dedicated name is used so the `data` DataFrame loaded at the top of the
# notebook is not overwritten (re-running earlier cells would otherwise break).
sample_texts = [
    "good",
    "bad",
    "good",
]

classifier(sample_texts)

[{'label': 'positive', 'score': 0.975800096988678},
 {'label': 'negative', 'score': 0.9816442131996155},
 {'label': 'positive', 'score': 0.975800096988678}]

In [22]:
# Model usage example 2: classify a real app-store style review.
# A dedicated name is used so the `data` DataFrame loaded at the top of the
# notebook is not overwritten (re-running earlier cells would otherwise break).
review_texts = [
    """When are you guys going to fix all the issues?? Firstly, none of the reaction emojis are showing up. It's just a grey circle. When scrolling, it doesn't move freely. There's like a delay!! Very frustrating!!! Also, nearly every post is either from a "suggested page" or a "sponsered page". I hardly ever see anything from the pages that I actually follow or my friends pages. No wonder so many people are leaving FB 🙄🙄"""
]

classifier(review_texts)

# Example application: given a Google Play Store link > crawl all review data
# > keep only reviews whose negative score is 0.8 or higher and share them with the client.
# 1-star review with a negative score of 0.8 or higher > send a Slack alert > (I) respond.

[{'label': 'negative', 'score': 0.9817873239517212}]

In [None]:
# 모델을 s3에 업로드
# 1. AWS 로그인 한 다음 > 버킷 생성
# 2. boto3를 활용해서 코드 베이스 s3 생성 및 파일 업로드
# !pip install boto3

### AWS buckets 생성 및 배포
1. IAM 계정 생성 / EC2 - pem키 생성
2. AWS CLI 명령어 설치
https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html

> aws --version
> aws configure
```
AWS Access Key ID [None]: <YOUR_ACCESS_KEY_ID>
AWS Secret Access Key [None]: <YOUR_SECRET_ACCESS_KEY>
Default region name [None]: ap-northeast-2
Default output format [json]: json
```
**주의**: 실제 자격 증명(Access Key / Secret Key)은 노트북이나 코드에 절대 남기지 마세요. 이전에 이 자리에 노출되었던 키는 IAM에서 즉시 비활성화하고 새로 발급해야 합니다.
> aws configure list

#### aws cli 설치
- curl "https://awscli.amazonaws.com/AWSCLIV2.pkg" -o "AWSCLIV2.pkg"
- sudo installer -pkg AWSCLIV2.pkg -target /

#### 설치 확인
- which aws
/usr/local/bin/aws 
- aws --version
aws-cli/2.17.20 Python/3.11.6 Darwin/23.3.0 botocore/2.4.5

In [38]:
# boto3를 통해 AWS s3 bucket을 활용해 버킷 생성

import boto3
import time
from botocore.exceptions import ClientError

# Create the S3 client (boto3 resolves credentials itself, e.g. from `aws configure`).
s3 = boto3.client('s3')
# S3 bucket names may only contain lowercase letters, digits, dots and hyphens;
# the underscores in the previous name would be rejected by S3.
bucket_name = 'nam-models-bucket'

def create_bucket(bucket_name, max_retries=3):
    """Create the S3 bucket `bucket_name` in ap-northeast-2 unless it is already listed.

    The buckets returned by `s3.list_buckets()` are checked first; nothing is
    done when `bucket_name` is among them.

    Args:
        bucket_name: Name of the bucket to create.
        max_retries: Number of retries (3 seconds apart) after an unexpected
            ClientError before giving up.

    Raises:
        ClientError: If creation still fails after `max_retries` retries with
            an error other than BucketAlreadyExists / BucketAlreadyOwnedByYou.
    """
    response = s3.list_buckets()
    # Read the name from each listed bucket (indexing the result list itself
    # with "Name" raised TypeError).
    owned_buckets = {bucket["Name"] for bucket in response["Buckets"]}
    if bucket_name in owned_buckets:
        return

    # Bounded loop instead of unbounded recursion: a permanent error (for
    # example an invalid bucket name) must not retry forever.
    for attempt in range(max_retries + 1):
        try:
            s3.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={'LocationConstraint': 'ap-northeast-2'},
            )
        except ClientError as e:
            print('오류 발생 :', e)
            error_code = e.response['Error']['Code']

            if error_code == 'BucketAlreadyExists':
                print('다른 버킷 이름을 입력하세요.')
                return
            if error_code == 'BucketAlreadyOwnedByYou':
                print('이미 만들어 져있는 버킷입니다.')
                return
            if attempt == max_retries:
                raise

            print('버킷 만들기를 재시도 중입니다.')
            time.sleep(3)
        else:
            return

# aws credentials
# aws configure list

## S3에 파일 업로드

In [40]:
import boto3
import os

s3 = boto3.client('s3')
bucket_name = '112test-bucket'  # destination bucket for the upload
file_path = 'tinybert-sentiment-analysis'  # local folder holding the saved model

# s3.upload_file() # uploads a single file only (folders are not supported)

# Upload every file under a local folder (sub-folders included) to S3.
def s3_upload_file_folder_name(model_folder, folder_name):
    """Upload all files under `model_folder` to the bucket named by `bucket_name`.

    Each file is stored under the key `<folder_name>/<path relative to
    model_folder>`, so files in sub-folders keep their relative location
    instead of being flattened into one level (where files with the same name
    would overwrite each other).

    Args:
        model_folder: Local directory to walk.
        folder_name: Key prefix ("folder") inside the bucket.
    """
    import posixpath  # S3 keys use '/' regardless of the local OS separator

    for root, _dirs, files in os.walk(model_folder):
        for file_name in files:
            local_path = os.path.join(root, file_name)
            relative_path = os.path.relpath(local_path, model_folder)
            s3_key = posixpath.join(folder_name, *relative_path.split(os.sep))
            s3.upload_file(local_path, bucket_name, s3_key)

# Upload the saved model folder under the 'tinybert-test-folder' prefix.
s3_upload_file_folder_name(file_path, 'tinybert-test-folder')
