In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re

In [None]:
# Read the three competition files in one pass: the training rows, the test
# rows and the sample submission.
train, test, submit = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/dacon9/open/train.csv',
        '../input/dacon9/open/test_x.csv',
        '../input/dacon9/open/sample_submission.csv',
    )
)

In [None]:
# Check how the training rows are distributed over the author labels; it is
# not yet clear whether the classes can be considered balanced.

author_counts = train['author'].value_counts()

fig, ax = plt.subplots(1, 1, figsize=(10, 10), dpi=200)

ax.bar(author_counts.index, author_counts.values)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('author')
ax.set_ylabel('number of texts')
ax.set_title('Training samples per author')

plt.show()

In [None]:
# Count the whitespace-separated words of every training text and summarise
# the counts; it is not yet clear whether these lengths count as "long".

temp = train['text'].map(lambda doc: len(doc.split()))

temp.describe()

### tf-idf

#### 사용하는 이유

1. 특징 추출 - 수치화
2. CountVectorizer도 있다. 다만 단순 빈도만 세기 때문에 조사처럼 의미 없이 자주 나오는 단어에 높은 점수를 줄 수 있다는 한계가 있다.
3. 즉, tf는 한 문서 안에서 자주 나오는 단어에 가점(advantage)을 주고, idf는 여러 문서에 공통으로 나오는 단어에 감점(disadvantage)을 준다.

In [None]:
# Toy TF-IDF example, adapted from https://chan-lab.tistory.com/24?category=810217

from sklearn.feature_extraction.text import TfidfVectorizer

# Three tiny "documents" that share part of their vocabulary.
text = [
    'I go to my home my home is very large',      # Doc[0]
    'I went out my home I go to the market',      # Doc[1]
    'I bought a yellow lemon I go back to home',  # Doc[2]
]
tfidf_vectorizer = TfidfVectorizer()  # unfitted TF-IDF vectorizer, default settings

In [None]:
# Fit the vectorizer on the toy corpus and display the resulting dense TF-IDF
# matrix (one row per document).
# For inspection after fitting:
#   tfidf_vectorizer.vocabulary_                  -> the learned vocabulary
#   sorted(tfidf_vectorizer.vocabulary_.items())  -> the vocabulary, sorted
#   tfidf_vectorizer.idf_                         -> the IDF weights
tfidf_vectorizer.fit_transform(text).toarray()

#### baseline 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import log_loss, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Lower-case every training text.
train['text'] = train['text'].str.lower()
# Optional punctuation stripping, currently disabled:
# train['text']=train['text'].apply(lambda x:re.sub(r'[\.,“”!?]','',x))

In [None]:
# Remove the 'index' column from the training frame.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because 'index' has already been dropped.
train = train.drop(columns=['index'], errors='ignore')

In [None]:
# Stratified 80/20 hold-out split of texts and author labels with a fixed seed.
X_tr, X_val, y_tr, y_val = train_test_split(
    train['text'],
    train['author'],
    test_size=0.2,
    random_state=71,
    stratify=train['author'],
)
print(*(part.shape for part in (X_tr, X_val, y_tr, y_val)))

In [None]:
# Building blocks of the baseline: a TF-IDF vectorizer configured with
# min_df=2 and a Bernoulli naive Bayes classifier that binarizes its input
# at a threshold of 0.
vectorizer = TfidfVectorizer(min_df=2)
classifier = BernoulliNB(alpha=1, binarize=0)
# NOTE(review): `scaler` is not used by any cell visible here.
scaler = MaxAbsScaler()

In [None]:
# Chain vectorizer -> classifier and fit the whole pipeline on the training split.
pl = make_pipeline(vectorizer, classifier)
pl.fit(X=X_tr, y=y_tr)

In [None]:
# Hard labels and per-class probabilities for the validation texts.
preds_class = pl.predict(X_val)
preds_prob = pl.predict_proba(X_val)

In [None]:
# Log loss of the naive Bayes baseline on the validation split.
nb_loss = log_loss(y_true=y_val, y_pred=preds_prob)
print(nb_loss)

In [None]:
# Confusion matrix and per-class report for the baseline, separated by blank
# lines in the printed output.
nb_confusion = confusion_matrix(y_val, preds_class)
nb_report = classification_report(y_val, preds_class)
print(nb_confusion, '\n', nb_report, sep='\n')

#### ALBERT

In [None]:
# Install tensorflow_text pinned to the 2.3 release; it is imported below as `text`.
# presumably required by the TF Hub ALBERT preprocessing layer — TODO confirm
!pip install tensorflow_text==2.3

In [None]:
import tensorflow
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
def model(num_classes=5, dropout_rate=0.5):
    """Build an ALBERT-based text classifier as a Keras functional model.

    Raw strings go through the TF Hub ALBERT preprocessing layer and the
    trainable ALBERT base encoder; the encoder's ``pooled_output`` is passed
    through dropout and a softmax layer.

    Args:
        num_classes: Number of units in the softmax output layer. Defaults
            to 5, the value that was previously hard-coded.
        dropout_rate: Dropout rate applied to the pooled output. Defaults to
            0.5, the value that was previously hard-coded.

    Returns:
        An uncompiled ``tensorflow.keras.Model`` that maps a batch of strings
        to ``num_classes`` softmax outputs per input string.
    """
    # The input is named 'text' so that dict-style batches keyed by the
    # DataFrame column name (as produced by df_to_dataset) map onto it by name.
    text_input = tensorflow.keras.layers.Input(
        shape=(), dtype=tensorflow.string, name='text')

    # Both hub handles use https; the preprocessing handle used plain http before.
    preprocessing_layer = hub.KerasLayer(
        'https://tfhub.dev/tensorflow/albert_en_preprocess/2')
    encoder = hub.KerasLayer(
        'https://tfhub.dev/tensorflow/albert_en_base/2', trainable=True)

    encoder_inputs = preprocessing_layer(text_input)
    outputs = encoder(encoder_inputs)
    pooled_output = outputs["pooled_output"]

    # Classification head: dropout followed by a softmax over the classes.
    output = tensorflow.keras.layers.Dropout(dropout_rate)(pooled_output)
    output = tensorflow.keras.layers.Dense(num_classes, activation='softmax')(output)
    return tensorflow.keras.Model(text_input, output)

In [None]:
# Instantiate the ALBERT classifier built by model(); it is compiled and trained in later cells.
text_model=model()

In [None]:
# Install tf-models-official (quiet mode) and import its NLP optimization helpers,
# which provide create_optimizer() used when the optimizer is built.
# NOTE(review): this install is unpinned while tensorflow_text is pinned to 2.3 —
# consider pinning a release that matches the same TensorFlow version; confirm.
!pip install -q tf-models-official
from official.nlp import optimization

In [None]:
# Stratified 80/20 split of the full training frame, using the same test size
# and seed as the baseline split.
tf_train, tf_val = train_test_split(
    train,
    test_size=0.2,
    random_state=71,
    stratify=train['author'],
)
print(*(frame.shape for frame in (tf_train, tf_val)))

In [None]:
def df_to_dataset(df,batch_size=47):
    """Turn a DataFrame into a batched ``tf.data.Dataset`` of (features, label).

    Args:
        df: Frame containing an 'author' column (used as the label); every
            other column becomes an entry of the feature dict. The frame
            itself is left untouched because a copy is used.
        batch_size: Number of rows per batch.

    Returns:
        A dataset yielding ``(dict of column -> values, author values)`` batches
        in the row order of ``df``.
    """
    features = df.copy()
    labels = features.pop('author')
    slices = tensorflow.data.Dataset.from_tensor_slices((dict(features), labels))
    return slices.batch(batch_size)

# Batched tf.data datasets for the training and validation frames.
tr_ds, val_ds = (df_to_dataset(frame) for frame in (tf_train, tf_val))

In [None]:
# Step budget for the optimizer: total steps = batches per epoch x epochs,
# and 10% of those steps are reserved for warm-up.
epochs = 3
steps_per_epoch = tensorflow.data.experimental.cardinality(tr_ds).numpy()
num_train_steps = epochs * steps_per_epoch
num_warmup_steps = int(num_train_steps * 0.1)

optimizer = optimization.create_optimizer(
    init_lr=3e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

In [None]:
# Compile with the optimizer built above and track accuracy.
# The sparse loss takes integer class ids as targets — assumes 'author' is
# encoded that way; TODO confirm.
text_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Batch-size trade-off: a larger batch has to keep more in memory and can run
# out of memory, while a smaller batch means more steps per epoch, so training
# gets slower even when memory is left over.

# With the current settings one run takes about 13 hours.

# tr_ds / val_ds are already batched tf.data datasets, so `batch_size` is not
# passed to fit(): the batch size is fixed when the datasets are built.
# The epoch count reuses `epochs`, the value the optimizer's num_train_steps
# was derived from, so training length and step budget cannot drift apart.
with tensorflow.device('/device:GPU:0'):
    history = text_model.fit(x=tr_ds, validation_data=val_ds, epochs=epochs)

In [None]:
# Evaluate on both splits. The datasets are already batched, so `batch_size`
# is not passed to evaluate().
train_metrics = text_model.evaluate(tr_ds)
val_metrics = text_model.evaluate(val_ds)
# Display both results; previously the training result was discarded because
# only the last expression of a cell is shown.
train_metrics, val_metrics

In [None]:
# Per-class probabilities for the validation batches. val_ds is built from
# tf_val without shuffling, so rows follow the order of tf_val.
tf_preds = text_model.predict(val_ds)

# Take the ground truth from the frame val_ds was built from. The previous
# code referenced `y_test`, which is never defined and raised NameError.
y_true = tf_val['author']

albert_loss = log_loss(y_true, tf_preds)
print(albert_loss)

# Most probable class per row. Assumes the author labels are the integers
# 0..4 so that column positions equal label values — TODO confirm.
tf_preds_class = np.argmax(tf_preds, axis=1)

print(confusion_matrix(y_true, tf_preds_class))
print('\n')
print(classification_report(y_true, tf_preds_class))

In [None]:
# Predict on the competition test texts and wrap the result in a DataFrame.
preds = text_model.predict(test['text'].to_numpy())
preds_df = pd.DataFrame(preds)

In [None]:
# Submission frame: the test rows with the 'text' column removed and the
# prediction columns appended, written without the pandas row index.
submit = pd.concat([test, preds_df], axis=1).drop(columns='text')
submit.to_csv('submit.csv', index=False, index_label=False)