# [Module 1.0] Tweet 입력 데이터 클린징 및 S3에 업로드

이 노트북에서는 아래와 같은 작업을 진행 합니다.
- Tweet 데이타를 클린징
- Tweet 데이타 통계 확인
- 총 990개의 이모티콘 레이블을 10개로 한정 함
- Emoticon to Index의 사전 생성
- Tweet Data를 Train 및 Test로 분리
    - Train_01_text, Train_01_label, Train_02_text, Train_02_label 분리
    - Test text, Test label로 분리
- Train_01_text, Train_01_label --> tweet_file_01.csv.gz 후 S3에 업로드
- Train_02_text, Train_02_label --> tweet_file_02.csv.gz 후 S3에 업로드

총 소요시간은 약 1분 걸립니다.

In [1]:
import pandas as pd
import re
import boto3
import numpy as np
import os


## Look at Raw Tweet Data
- 참고로 Tweet 데이터는 아래 책에 제공하는 데이타를 data 폴더에 미리 다운로드 했음.
- Original Data Source
    - Douwe Osinga, Deep Learning Cookbook. Ch7, Suggesting Emojis

In [2]:
file_name = os.path.join('data', "emojis.csv")
def read_data(file_name):
    all_tweets = pd.read_csv(file_name, names=['content', 'sentiment'])
    return all_tweets

raw_tweets = read_data(file_name)
print("all_tweets shape: ", raw_tweets.shape)
raw_tweets.head()

all_tweets shape:  (806204, 2)


Unnamed: 0,content,sentiment
0,text,emoji
1,@ATLHawks: Chance The Rapper or Kent Bazemore?...,🤔
2,"@nice_aju: Yup we love you, you're so precious...",💙
3,Fav Sing Me to Sleep by Alan Walker,💛
4,@AshBenzo: Wife From The Real-Life 'Fault In O...,💔


## Tweet Data Cleaning

중복 데이터 제거를 합니다.

In [3]:
def drop_duplicate_records(all_tweets):
    all_tweets_cleaned_df = all_tweets.drop_duplicates() 
    return all_tweets_cleaned_df

tweet_temp_df = drop_duplicate_records(raw_tweets)
print("Unique Tweets : \n {} ".format(tweet_temp_df.shape))
num_tweets = tweet_temp_df.shape[0]

Unique Tweets : 
 (613060, 2) 


In [4]:
all_tweets_cleaned_df = tweet_temp_df

## Tweet Data Shape
- 데이터는 Content 와 Sentiment의 두개의 컬럼으로 구성

In [5]:
all_tweets_cleaned_df.head()

Unnamed: 0,content,sentiment
0,text,emoji
1,@ATLHawks: Chance The Rapper or Kent Bazemore?...,🤔
2,"@nice_aju: Yup we love you, you're so precious...",💙
3,Fav Sing Me to Sleep by Alan Walker,💛
4,@AshBenzo: Wife From The Real-Life 'Fault In O...,💔


In [6]:
# show emoticon stat
num_emoticons = all_tweets_cleaned_df['sentiment'].nunique()
print("The number of unique emoticon: {}".format(num_emoticons)) # The number of unique emoticon: 990
print("Top 10 emoticons: ")
all_tweets_cleaned_df['sentiment'].value_counts()[0:10]

The number of unique emoticon: 990
Top 10 emoticons: 


😂    91590
❤    32899
😭    27890
😍    25531
🙄    16406
😊    14880
😩    13905
🤔    13488
💕    10006
🔥     9419
Name: sentiment, dtype: int64

## Tweet 의 단어 갯수
Count the number of words per token

In [7]:
import pandas as pd

def num_words_sentence(sentence):
    tokens = sentence.split()
    len_tokens = len(tokens)
    return len_tokens
    
def store_num_words(texts):    
    num_tokens= list()
    for text in texts:
        len_tokens = num_words_sentence(text)
        num_tokens.append(len_tokens)
                     
    return num_tokens
        
num_tokens_data = store_num_words(all_tweets_cleaned_df['content'])    
                   
df_num_tokens = pd.DataFrame(num_tokens_data, columns=['num_tokens'])    

print("Statistics of the number of words per tweet: ")
df_num_tokens.describe()


Statistics of the number of words per tweet: 


Unnamed: 0,num_tokens
count,613060.0
mean,9.789479
std,6.070048
min,1.0
25%,5.0
50%,8.0
75%,13.0
max,54.0


<h2>이모티콘 레이블을 10개로 한정 </h2>
<p>이모티콘의 레이블을 10개로 한정하기 위해, Top10 을 제외한 모든 데이타는 삭제 함.. <br>

In [8]:
num_threshold_lables = 9300 # process more than 9000 for lables
num_use_records = num_tweets # total of tweets

def filter_less_label(all_tweets, num_use_records,   num_filter_less_label):
    # Desc: filter the total data for being less label with specific number
    use_tweets = all_tweets[0:num_use_records]
    tweets = use_tweets.groupby('sentiment').filter(lambda c:len(c) > num_filter_less_label)

    return tweets

tweets = filter_less_label(all_tweets_cleaned_df, num_use_records,   num_threshold_lables)
tweets.sentiment.value_counts()

😂    91590
❤    32899
😭    27890
😍    25531
🙄    16406
😊    14880
😩    13905
🤔    13488
💕    10006
🔥     9419
Name: sentiment, dtype: int64

In [9]:
from sklearn.utils import resample

print(tweets.sentiment.unique())
emoji_0_df = tweets.query("sentiment == '🤔'")
emoji_1_df = tweets.query("sentiment == '❤'")
emoji_2_df = tweets.query("sentiment == '😭'")
emoji_3_df = tweets.query("sentiment == '😂'")
emoji_4_df = tweets.query("sentiment == '🙄'")

emoji_5_df = tweets.query("sentiment == '😍'")
emoji_6_df = tweets.query("sentiment == '💕'")
emoji_7_df = tweets.query("sentiment == '😊'")
emoji_8_df = tweets.query("sentiment == '😩'")
emoji_9_df = tweets.query("sentiment == '🔥'")

minority_count = min(emoji_0_df.shape[0],
                     emoji_1_df.shape[0],
                     emoji_2_df.shape[0],
                     emoji_3_df.shape[0],
                     emoji_4_df.shape[0],
                     emoji_5_df.shape[0],
                     emoji_6_df.shape[0],
                     emoji_7_df.shape[0],
                     emoji_8_df.shape[0],
                     emoji_9_df.shape[0],                     
                    )
minority_count

emoji_0_df = resample(emoji_0_df,
                        replace = False,
                        n_samples = minority_count,
                        random_state = 27)

emoji_1_df = resample(emoji_1_df,
                        replace = False,
                        n_samples = minority_count,
                        random_state = 27)


emoji_2_df = resample(emoji_2_df,
                        replace = False,
                        n_samples = minority_count,
                        random_state = 27)

emoji_3_df = resample(emoji_3_df,
                        replace = False,
                        n_samples = minority_count,
                        random_state = 27)

emoji_4_df = resample(emoji_4_df,
                        replace = False,
                        n_samples = minority_count,
                        random_state = 27)

emoji_5_df = resample(emoji_5_df,
                        replace = False,
                        n_samples = minority_count,
                        random_state = 27)

emoji_6_df = resample(emoji_6_df,
                        replace = False,
                        n_samples = minority_count,
                        random_state = 27)

emoji_7_df = resample(emoji_7_df,
                        replace = False,
                        n_samples = minority_count,
                        random_state = 27)

emoji_8_df = resample(emoji_8_df,
                        replace = False,
                        n_samples = minority_count,
                        random_state = 27)

emoji_9_df = resample(emoji_9_df,
                        replace = False,
                        n_samples = minority_count,
                        random_state = 27)

df_balanced = pd.concat([
    emoji_0_df, emoji_1_df,
    emoji_2_df, emoji_3_df,
    emoji_4_df, emoji_5_df,
    emoji_6_df, emoji_7_df,
    emoji_8_df, emoji_9_df,    
])
df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)

print("Shape: ", df_balanced.shape)
print(df_balanced.sentiment.value_counts())

['🤔' '❤' '😭' '😂' '🙄' '😍' '💕' '😊' '😩' '🔥']
Shape:  (94190, 2)
😭    9419
😂    9419
😊    9419
❤    9419
😍    9419
😩    9419
🙄    9419
💕    9419
🤔    9419
🔥    9419
Name: sentiment, dtype: int64


## Emoticon to index 의 사전 생성

In [10]:
from TweetData import TweetData # Custom class for handling input data
tweet_data = TweetData(df_balanced)

In [11]:
# make emoji_to_idx and save it into data
emoji_to_idx = tweet_data.make_sentimet_label()

data/emoji_to_idx.pickle is saved
{'❤': 0, '💕': 1, '🔥': 2, '😂': 3, '😊': 4, '😍': 5, '😩': 6, '😭': 7, '🙄': 8, '🤔': 9}


## Tweet 데이터를 Train text, Train label, Test text, Test label으로 분리

In [12]:
texts, labels = tweet_data.make_texts_lables()
print("text: {}, label: {}: \n".format(texts[0:5], labels[0:5]))

text: [' cake en snapchat', " nobody's business partners", " don't get me started", ' the ice cream man did him dirty', 'gonna miss a snow club party'], label: [0, 5, 8, 6, 6]: 



In [13]:
df = pd.DataFrame(data={'content':texts, 'label':labels})

In [14]:
df.label

0        0
1        5
2        8
3        6
4        6
        ..
94185    6
94186    0
94187    4
94188    8
94189    6
Name: label, Length: 94190, dtype: int64

Train 용으로 2개의 파일을 두개를 만들고 (전체 대비 0.45, 0.45), 1개의 테스트 파일(0.1)을 생성 함

In [15]:
train_text, train_label, test_text, test_label = tweet_data.split_train_test_data(texts, labels, 0.9)
train_text_01, train_label_01, train_text_02, train_label_02 = tweet_data.split_train_test_data(train_text, train_label, 0.5)


In [16]:
DATA_COLUMN = 'TWEET'
LABEL_COLUMN = 'LABEL'

tweet_file_01_df = pd.DataFrame({DATA_COLUMN: train_text_01,LABEL_COLUMN:train_label_01})
tweet_file_02_df = pd.DataFrame({DATA_COLUMN: train_text_02,LABEL_COLUMN:train_label_02})
tweet_file_test_df = pd.DataFrame({DATA_COLUMN: test_text,LABEL_COLUMN:test_label})

In [17]:
print(tweet_file_01_df.info())
print(tweet_file_02_df.info())
print(tweet_file_test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42385 entries, 0 to 42384
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   TWEET   42385 non-null  object
 1   LABEL   42385 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 662.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42386 entries, 0 to 42385
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   TWEET   42386 non-null  object
 1   LABEL   42386 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 662.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9419 entries, 0 to 9418
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   TWEET   9419 non-null   object
 1   LABEL   9419 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 147.3+ KB
None


- 2개의 Train 파일을 data/split/ 에 저장
- 1개의 Test파일을 data/test/에 저장

In [18]:
save_split_data_dir = 'data/split'
save_test_data_dir = 'data/test'
tweet_file_01_file = "tweet_file_01.csv"
tweet_file_02_file = "tweet_file_02.csv"
tweet_file_test_file = "tweet_file_test.csv"
os.makedirs(save_split_data_dir, exist_ok=True)
os.makedirs(save_test_data_dir, exist_ok=True)
tweet_data.save_input_data(save_split_data_dir, tweet_file_01_file , tweet_file_01_df)
tweet_data.save_input_data(save_split_data_dir, tweet_file_02_file , tweet_file_02_df)
tweet_data.save_input_data(save_test_data_dir, tweet_file_test_file , tweet_file_test_df)


data/split/tweet_file_01.csv is saved
data/split/tweet_file_02.csv is saved
data/test/tweet_file_test.csv is saved


In [19]:
tweet_file_01_file_path = os.path.join(save_split_data_dir, tweet_file_01_file)
tweet_file_02_file_path = os.path.join(save_split_data_dir, tweet_file_02_file)
print("tweet_file_01_file_path: ", tweet_file_01_file_path)
print("tweet_file_02_file_path: ", tweet_file_02_file_path)

tweet_file_01_file_path:  data/split/tweet_file_01.csv
tweet_file_02_file_path:  data/split/tweet_file_02.csv


2개의 Train 파일을 gzip으로 압축함

In [20]:
! gzip -f {tweet_file_01_file_path}
! gzip -f {tweet_file_02_file_path}

## 로컬에 저장된 2개의 gzip Train 파일을 S3에 저장 함.

In [21]:
import boto3
import sagemaker

# Get region 
session = boto3.session.Session()
region_name = session.region_name

# Get SageMaker session & default S3 bucket
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

In [22]:
s3_destination_path_csv = 's3://{}/tweet_emoticon/csv'.format(bucket)
print(s3_destination_path_csv)
!aws s3 cp  $save_split_data_dir $s3_destination_path_csv/ --recursive

s3://sagemaker-ap-northeast-2-343441690612/tweet_emoticon/csv
upload: data/split/tweet_file_02.csv.gz to s3://sagemaker-ap-northeast-2-343441690612/tweet_emoticon/csv/tweet_file_02.csv.gz
upload: data/split/tweet_file_01.csv.gz to s3://sagemaker-ap-northeast-2-343441690612/tweet_emoticon/csv/tweet_file_01.csv.gz


In [23]:
! aws s3 ls {s3_destination_path_csv}/

2020-08-16 00:18:13     839083 tweet_file_01.csv.gz
2020-08-16 00:18:13     839509 tweet_file_02.csv.gz


In [24]:
%store s3_destination_path_csv
%store save_split_data_dir

Stored 's3_destination_path_csv' (str)
Stored 'save_split_data_dir' (str)
