<a href="https://colab.research.google.com/github/TaeGuSeo/Terms-and-Conditions-Analysis-System/blob/main/%EC%9C%A0%EB%B6%88%EB%A6%AC_%ED%8C%90%EB%8B%A8_%EB%AA%A8%EB%8D%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## 유불리 판단 모델

# 입력받은 약관 전문을 문자열로 나눈 뒤, 문장별로 사용자에게 유리한지 불리한지를 판단하여 유리조항리스트와 불리조항리스트로 추출하는 과정에 필요한 모델.
# 초기에는 일렉트라, 버트, 코버트를 모두 사용해 보았으나, 학습 과정에서의 손실과 정확도를 비교한 결과 버트 계열 모델이 더 적합하다고 판단하여 버트와 코버트를 사용하여 수행함.

In [None]:
from flask import Flask, request, jsonify
from flask_cors import CORS
import cv2
import pytesseract
import numpy as np
import re
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, set_seed
from torch.utils.data import DataLoader, Dataset
import torch
import os
import nltk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import copy

# Fix the random seed used by transformers/torch.
set_seed(123)

# Flask application with CORS enabled for every route.
app = Flask(__name__)
CORS(app)

# Location of the Tesseract executable used by pytesseract.
# NOTE(review): this is a raw string that also doubles its backslashes, so the
# value contains literal double separators; left byte-identical — confirm it is
# intended.
pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

# Initialize the favorable/unfavorable classification model.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased", do_lower_case=False)
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

# Built with os.path.join instead of the literal '.\model_state_dict.pkl':
# '\m' is an invalid escape sequence, and the joined path resolves to the same
# value on Windows while also working with other path separators.
model_path = os.path.join('.', 'model_state_dict.pkl')
# SECURITY NOTE: torch.load unpickles the file; only load checkpoints that
# come from a trusted source.
state_dict = torch.load(model_path, map_location=torch.device('cpu'))
# Resize the embedding matrix to the tokenizer's vocabulary size before
# loading the saved weights.
model.resize_token_embeddings(len(tokenizer))
model.load_state_dict(state_dict)
model.eval()



# Initialize the summarization model.
# The disabled block that previously sat here loaded the
# 'eenzeenee/t5-base-korean-summarization' checkpoint instead of the one below.
#
# nltk.sent_tokenize needs the Punkt sentence tokenizer data. Recent NLTK
# releases read it from the 'punkt_tab' package rather than 'punkt', so both
# are fetched to keep sentence splitting working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
tokenizer_summ = AutoTokenizer.from_pretrained('Youngwoo9/T5_Pyeongsan')
model_summ = AutoModelForSeq2SeqLM.from_pretrained('Youngwoo9/T5_Pyeongsan')
model_summ.eval()

# Result buffers: the /api/process handler clears and refills these lists,
# and the /process handler returns their current contents.
ad_sentences, disad_sentences = [], []
ad_sum_sentences, disad_sum_sentences = [], []

# Placeholder title and keyword lists.
# NOTE(review): the name `title` is rebound later in this file by the
# `def title()` route handler; `keywords` is not read anywhere in the
# visible code.
title = [
    '미지정',
]
keywords = [
    '미지정1',
    '미지정2',
    '미지정3',
]



@app.route('/api/upload', methods=['POST'])
def handle_image_upload():
    """Run Korean OCR on an uploaded image and return the extracted text.

    Expects a multipart form upload with the file under the ``image`` field.

    Returns:
        A JSON response ``{'text': <extracted text>}`` on success, or a JSON
        ``{'error': ...}`` body with status 400 when the field is missing,
        the upload is empty, or the bytes cannot be decoded as an image.
    """
    if 'image' not in request.files:
        return jsonify({'error': '이미지를 제공하지 않았습니다.'}), 400

    raw_bytes = request.files['image'].read()
    if not raw_bytes:
        # An empty upload carries no image data; report it like a missing file
        # instead of handing an empty buffer to OpenCV.
        return jsonify({'error': '이미지를 제공하지 않았습니다.'}), 400

    image = cv2.imdecode(np.frombuffer(raw_bytes, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        # imdecode signals undecodable data by returning None; without this
        # guard cvtColor would raise and the client would get a 500.
        return jsonify({'error': '이미지를 읽을 수 없습니다.'}), 400

    # OCR is run on the grayscale version of the image.
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray_image, lang='kor')

    print('추출된 텍스트:', text)
    response_data = {'text': text}

    return jsonify(response_data)


# Minimum softmax probability required to list a sentence under a label.
_LABEL_THRESHOLD = 0.995
# Maximum token length used when encoding sentences for the classifier.
_CLASSIFIER_MAX_TOKENS = 128
# Task prefix prepended to every summarization input.
_SUMMARY_PREFIX = "summarize: "


def _collate_for_classifier(batch):
    """Tokenize a list of sentence strings into padded, truncated PyTorch tensors."""
    return tokenizer(
        text=batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=_CLASSIFIER_MAX_TOKENS,
    )


def _label_probabilities(inputs):
    """Return the two softmax class probabilities for the first item of a tokenized batch."""
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    prob_1, prob_2 = torch.softmax(logits, dim=1)[0]
    return prob_1.item(), prob_2.item()


def _summarize_sentence(sentence):
    """Return the first sentence of the model summary of *sentence*.

    The input is returned unchanged when it is not a string, does not end
    with a period, or when the model produces an empty summary, so callers
    always get exactly one entry per input.
    """
    if not isinstance(sentence, str) or not sentence.endswith('.'):
        return sentence

    inputs = tokenizer_summ(
        _SUMMARY_PREFIX + sentence,
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )
    # The output length is capped by the character count of the input sentence.
    output = model_summ.generate(
        **inputs,
        num_beams=3,
        do_sample=True,
        min_length=1,
        max_length=len(sentence),
    )
    decoded_output = tokenizer_summ.batch_decode(output, skip_special_tokens=True)[0].strip()

    # Guard against an empty decode: previously this case either raised
    # UnboundLocalError or silently reused the previous sentence's summary.
    summary_sentences = nltk.sent_tokenize(decoded_output) if decoded_output else []
    return summary_sentences[0] if summary_sentences else sentence


@app.route('/api/process', methods=['POST'])
def process_text():
    """Classify submitted sentences as favorable/unfavorable and summarize them.

    Expects a JSON object body; its ``sentences`` entry (default: empty list)
    is the list of sentences to analyze.

    Side effects:
        Clears and refills the module-level lists ``ad_sentences``,
        ``disad_sentences``, ``ad_sum_sentences`` and ``disad_sum_sentences``.
        A sentence is added to the favorable list when the first class
        probability exceeds the threshold, and to the unfavorable list when
        the second one does.

    Returns:
        A JSON response ``{'text': <deep copy of the submitted sentences>}``,
        or a JSON ``{'error': ...}`` body with status 400 when the body is not
        a JSON object or ``sentences`` is not a list.
    """
    data = request.get_json(silent=True)
    sentences = data.get('sentences', []) if isinstance(data, dict) else None
    if not isinstance(sentences, list):
        # Reject malformed requests before touching the shared result lists.
        return jsonify({'error': '문장 목록을 제공하지 않았습니다.'}), 400
    results = copy.deepcopy(sentences)

    # Reset the shared result lists for this request.
    ad_sentences.clear()
    disad_sentences.clear()
    ad_sum_sentences.clear()
    disad_sum_sentences.clear()

    # Preprocess the input: wrap every sentence in explicit [CLS]/[SEP] markers.
    # NOTE(review): the tokenizer call presumably adds its own special tokens
    # as well; kept as-is because the saved weights may expect this exact
    # input format — confirm against the training code.
    classifier_inputs = ["[CLS] " + str(s) + " [SEP]" for s in sentences]

    print('Dealing with Data...')
    # One sentence per batch, in order, so batches line up with `sentences`.
    batches = DataLoader(classifier_inputs, batch_size=1, collate_fn=_collate_for_classifier)
    print('Created `sentences` with %d batches!' % len(batches))

    for inputs, text in zip(batches, sentences):
        print('들어온 문장 : ', text, '\n')
        print()
        prob_1, prob_2 = _label_probabilities(inputs)
        print('유리확률 : ', prob_1, '\n')
        print()
        print('불리확률 : ', prob_2, '\n')
        print()
        if prob_1 > _LABEL_THRESHOLD:
            ad_sentences.append(text)
        if prob_2 > _LABEL_THRESHOLD:
            disad_sentences.append(text)

    print(ad_sentences)
    print(disad_sentences)

    # One summary entry per classified sentence, in the same order.
    ad_sum_sentences.extend(_summarize_sentence(s) for s in ad_sentences)
    disad_sum_sentences.extend(_summarize_sentence(s) for s in disad_sentences)

    print(ad_sum_sentences)
    print(disad_sum_sentences)

    return jsonify({'text': results})


@app.route('/process', methods=['GET'])
def process():
    """Return the current contents of the four shared result lists as JSON.

    The response maps ``positive_texts``/``negative_texts`` to the classified
    sentences and ``sum_p_texts``/``sum_n_texts`` to their summaries.
    """
    payload = dict(
        positive_texts=ad_sentences,
        negative_texts=disad_sentences,
        sum_p_texts=ad_sum_sentences,
        sum_n_texts=disad_sum_sentences,
    )
    return jsonify(payload)


@app.route('/title', methods=['GET'])
def title():
    """Return a hard-coded title list as JSON under the ``title_texts`` key."""
    # NOTE(review): defining this function rebinds the module-level name
    # `title`, which is assigned a list earlier in the file; that list is not
    # what gets returned here.
    return jsonify({'title_texts': ["001_온라인게임_가공"]})


@app.route('/keyword', methods=['GET'])
def keyword():
    """Return a hard-coded keyword list as JSON under the ``keyword_texts`` key."""
    # NOTE(review): the value is the same string the /title route returns and
    # the module-level `keywords` list is not used — confirm this is intended.
    return jsonify({'keyword_texts': ["001_온라인게임_가공"]})


if __name__ == "__main__":
    # Start the Flask server on all interfaces at port 5000 with the
    # `threaded` option enabled.
    app.run(host='0.0.0.0', port=5000, threaded=True)