<a href="https://colab.research.google.com/github/Vaycold/statistics_seminar/blob/main/covid19_tweet/auto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import os
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings 
import re
from google.colab import files

warnings.filterwarnings('ignore')

os.environ['KAGGLE_USERNAME'] = 'kimgodbang'
os.environ['KAGGLE_KEY'] = '1c0e1716b23829d4381dcbced37ba49b'

!rm *.*
!kaggle competitions download -c sentiment-analysis-of-covid-19-related-tweets

train_df = pd.read_csv('training.csv')
test_df = pd.read_csv('validation.csv')

!pip install sentencepiece
import sentencepiece as spm

Downloading training.csv to /content
  0% 0.00/518k [00:00<?, ?B/s]
100% 518k/518k [00:00<00:00, 33.9MB/s]
Downloading validation.csv to /content
  0% 0.00/254k [00:00<?, ?B/s]
100% 254k/254k [00:00<00:00, 78.3MB/s]


In [32]:
train_df.head()

Unnamed: 0,ID,Tweet,Labels
0,1,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...,0 10
1,2,BanMediaHouse whose is responsible for spreadi...,6
2,3,Im waiting for someone to say to me that all t...,3 4
3,4,He is a liar. Proven day night. Time again. Li...,6
4,5,"NEW: U.S. CoronaVirus death toll reaches 4,000...",8


In [33]:
def labelsplit(labels) :
    
    label = labels.replace(' ',',').split(',')
    for idx, lab in enumerate(label) :
        label[idx] = int(lab)
    return label

In [34]:
train_df['label']  = train_df['Labels'].apply(lambda x :  labelsplit(x))
train_df.head(2)

Unnamed: 0,ID,Tweet,Labels,label
0,1,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...,0 10,"[0, 10]"
1,2,BanMediaHouse whose is responsible for spreadi...,6,[6]


In [35]:
onehot = np.zeros((5000,11))
for i in range(5000) : # 0~5000 row
    for j in train_df.label[i] : # j = value   
        onehot[i,j]  = 1

onehot = pd.DataFrame(onehot, columns=['0','1','2','3','4','5','6','7','8','9','10'])
onehot[:3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
train_df = pd.concat(
    [train_df, onehot], axis=1
).drop(['ID','Labels', 'label'],axis=1)

train_df.head(2)

Unnamed: 0,Tweet,0,1,2,3,4,5,6,7,8,9,10
0,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,BanMediaHouse whose is responsible for spreadi...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [37]:
full_text = np.concatenate(
    [train_df.Tweet.values, test_df.Tweet.values]
)
with open('./full_text.txt', 'w', encoding='utf-8') as f:
    for line in full_text :
        if len(str(line).strip()) == 0 : continue
        try :
            f.write(line+'\n')
        except : continue

In [38]:
spm.SentencePieceTrainer.train('--input=full_text.txt --model_prefix=m --vocab_size=10000')

In [39]:
sp = spm.SentencePieceProcessor()
sp.load('m.model')

True

In [40]:
train_df.head()

Unnamed: 0,Tweet,0,1,2,3,4,5,6,7,8,9,10
0,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,BanMediaHouse whose is responsible for spreadi...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,Im waiting for someone to say to me that all t...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,He is a liar. Proven day night. Time again. Li...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,"NEW: U.S. CoronaVirus death toll reaches 4,000...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [42]:
train_df['bow'] = train_df['Tweet'].apply(lambda x : sp.encode_as_ids(str(x)))
train_df.head(2)

Unnamed: 0,Tweet,0,1,2,3,4,5,6,7,8,9,10,bow
0,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"[1444, 1527, 10, 1656, 8082, 1184, 80, 7231, 6..."
1,BanMediaHouse whose is responsible for spreadi...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,"[2607, 5959, 8833, 102, 588, 14, 1753, 493, 20..."


In [43]:
train_text = train_df.bow.values
train_text[:2]

array([list([1444, 1527, 10, 1656, 8082, 1184, 80, 7231, 6888, 46, 174, 8982, 180, 19, 16, 382, 148]),
       list([2607, 5959, 8833, 102, 588, 14, 1753, 493, 20, 499, 3184, 15, 2602, 5765, 7953, 18, 21, 156, 23, 545, 3])],
      dtype=object)

In [45]:
train_bow_text = tf.keras.preprocessing.sequence.pad_sequences(train_text, maxlen=70)

In [47]:
train_bow_text.shape

(5000, 70)

In [65]:
test_df['bow'] = test_df['Tweet'].apply(lambda x : sp.encode_as_ids(str(x)))
test_text = test_df.bow.values
test_bow_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen=70)

# 안나누기

In [46]:
target

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'], dtype='object')

In [71]:
# model

from keras.layers import Input, Embedding, GRU, Dense
from keras.models import Model
def seq2seq() :
    input_x_bow = Input(shape=(70,))
    embedding = Embedding(11412,120)
    x = embedding(input_x_bow)
    z = GRU(64)(x)

    y = Dense(2, activation = 'softmax')(z)

    model = Model(input_x_bow, y)
    model.compile(
        loss = 'categorical_crossentropy',
        optimizer = 'adam',
        metrics= ['accuracy']
    )
    return model

In [72]:
model = seq2seq()

In [75]:
def auto() :
    df = pd.DataFrame()
    for i in target :
        sentiment = train_df[i].values
        onehot_centiment = tf.keras.utils.to_categorical(sentiment)
        p0 = (onehot_centiment.sum(axis=0))[0]/sum(onehot_centiment.sum(axis=0))
        p1 = (onehot_centiment.sum(axis=0))[1]/sum(onehot_centiment.sum(axis=0))
        model = seq2seq()
        model.fit(train_bow_text, onehot_centiment, 
                  epochs = 1,
                  verbose = 0,
                  class_weight = {0 : p0 , 1 : p1})
        answer = model.predict(test_bow_text)
        Answer = pd.DataFrame(answer, columns = ['0','1'])
        df[i] = Answer['0'].apply(lambda x : 0 if x >= 0.5 else 1)
        print(f'label : {i} is over' )
    return df


        

In [54]:
tf.keras.utils.to_categorical(train_df['0'].values).sum(axis=0)[0]

array([3820., 1180.], dtype=float32)

In [56]:
centi = tf.keras.utils.to_categorical(train_df['0'].values)
centi

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [64]:
(centi.sum(axis=0))[0]/sum(centi.sum(axis=0))

0.764

In [77]:
plz = auto()
plz

label : 0 is over
label : 1 is over
label : 2 is over
label : 3 is over
label : 4 is over
label : 5 is over
label : 6 is over
label : 7 is over
label : 8 is over
label : 9 is over
label : 10 is over


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
2495,0,0,0,0,0,0,0,0,0,0,0
2496,0,0,0,0,0,0,0,0,0,0,0
2497,0,0,0,0,0,0,0,0,0,0,0
2498,0,0,0,0,0,0,0,0,0,0,0


In [78]:
plz.shape

(2500, 11)

In [79]:
for i in target :
    print( plz[i].value_counts())

0    2499
1       1
Name: 0, dtype: int64
0    2500
Name: 1, dtype: int64
0    2500
Name: 2, dtype: int64
0    2500
Name: 3, dtype: int64
0    2500
Name: 4, dtype: int64
0    2500
Name: 5, dtype: int64
0    2438
1      62
Name: 6, dtype: int64
0    2500
Name: 7, dtype: int64
0    2420
1      80
Name: 8, dtype: int64
0    2500
Name: 9, dtype: int64
0    1858
1     642
Name: 10, dtype: int64
