In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer, BertModel
import torch

In [3]:
# nltk의 불용어와 표제어 추출 준비
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SSAFY\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SSAFY\AppData\Roaming\nltk_data...


In [4]:
# BERT 모델과 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [5]:
# CSV 파일 로드
data = pd.read_csv('cefr_leveled_texts.csv')

In [6]:
# 텍스트 전처리 함수 정의
def preprocess_text(text):
    # 1. 토큰화 및 소문자 변환
    tokens = text.lower().split()
    
    # 2. 불용어 제거
    tokens = [word for word in tokens if word not in stop_words]
    
    # 3. 표제어 추출
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # 전처리된 텍스트 반환
    return ' '.join(tokens)

In [7]:
# 데이터에 전처리 적용
data['processed_text'] = data['text'].apply(preprocess_text)

In [8]:
# BERT 임베딩 함수 정의
def get_bert_embeddings(text):
    # 입력 텍스트를 BERT의 입력 형식으로 변환
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # 임베딩 결과는 모델의 마지막 은닉층
    embeddings = outputs.last_hidden_state.mean(dim=1)  # 평균 풀링으로 문장 벡터 생성
    return embeddings.squeeze().numpy()

In [9]:
# BERT 임베딩 생성 및 적용
data['bert_embeddings'] = data['processed_text'].apply(get_bert_embeddings)

# 결과 확인
print(data[['text', 'label', 'processed_text', 'bert_embeddings']].head())

                                                text label  \
0  Hi!\r\nI've been meaning to write for ages and...    B2   
1  ﻿It was not so much how hard people found the ...    B2   
2  Keith recently came back from a trip to Chicag...    B2   
3  The Griffith Observatory is a planetarium, and...    B2   
4  -LRB- The Hollywood Reporter -RRB- It's offici...    B2   

                                      processed_text  \
0  hi! i've meaning write age finally today i'm a...   
1  ﻿it much hard people found challenge far would...   
2  keith recently came back trip chicago, illinoi...   
3  griffith observatory planetarium, exhibit hall...   
4  -lrb- hollywood reporter -rrb- official: amc's...   

                                     bert_embeddings  
0  [-0.22955546, -0.0098610325, 1.032207, -0.1835...  
1  [-0.2589494, 0.044663645, 0.6809145, -0.111927...  
2  [-0.44110575, 0.12668371, 0.5283675, -0.103395...  
3  [-0.28058243, 0.2910775, 0.62295926, -0.081059...  
4  [-0.23377775

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier

In [11]:
# 전처리된 데이터 로드
# data = pd.read_csv('./cefr_leveled_texts.csv')  # 실제 파일명으로 변경하세요
X = np.stack(data['bert_embeddings'].values)   # BERT 임베딩을 사용한 입력 데이터
y = data['label'].values                       # CEFR 레벨 라벨

In [12]:
# 데이터 분할 (80% 학습용, 20% 테스트용)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
training_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42)

In [14]:
# 교차 검증을 통한 성능 평가
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(training_model, X_train, y_train, cv=kfold, scoring='accuracy')

print("K-Fold Cross-Validation Accuracy:", cv_scores)
print("Mean Cross-Validation Accuracy:", cv_scores.mean())

K-Fold Cross-Validation Accuracy: [0.64016736 0.62761506 0.60669456 0.58995816 0.56066946]
Mean Cross-Validation Accuracy: 0.605020920502092


In [15]:
# 최종 학습 및 예측
training_model.fit(X_train, y_train)
y_pred = training_model.predict(X_test)

# 모델 평가
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Test Accuracy: 0.6655518394648829
Classification Report:
               precision    recall  f1-score   support

          A1       0.83      0.85      0.84        67
          A2       0.73      0.69      0.71        52
          B1       0.61      0.47      0.53        36
          B2       0.53      0.74      0.62        54
          C1       0.57      0.41      0.48        51
          C2       0.70      0.72      0.71        39

    accuracy                           0.67       299
   macro avg       0.66      0.65      0.65       299
weighted avg       0.67      0.67      0.66       299



In [16]:
# 영화 대사 난이도 예측 함수
def predict_difficulty(dialogues):
    dialogues_processed = [preprocess_text(dialogue) for dialogue in dialogues]  # 전처리 적용
    embeddings = [get_bert_embeddings(dialogue) for dialogue in dialogues_processed]  # BERT 임베딩 적용
    predictions = training_model.predict(embeddings)
    return predictions

In [17]:
# 예시: 새로운 영화 대사 난이도 예측
new_dialogues = ["In view of this, the Ethiopian Government and other developmental partners have introduced an extensive mechanical and biological watershed conservation schemes in various parts of the country over the last decades particularly after the famine of the 1970s"]
difficulty_predictions = predict_difficulty(new_dialogues)
print("Predicted CEFR Levels:", difficulty_predictions)

Predicted CEFR Levels: ['C2']
