# Tweet input processing: Download, look into, and store to S3

In [1]:
import os
# Show the kernel's working directory: the relative paths used later (e.g. 'data/split')
# are resolved against it.
os.getcwd()

'/home/ec2-user/SageMaker/RecommendEmoticon/Tweet-BERT'

In [2]:
import pandas as pd
import re
import boto3
import numpy as np
import os

from TweetInput import TweetInput

<h2> Download tweet data </h2>
<p>Location: https://201912-mbp-gsmoon.s3.us-east-2.amazonaws.com/emoji_data/emojis.csv</p>

In [3]:
# Create the project-local TweetInput helper (imported from the TweetInput module).
tweet_input_obj = TweetInput()

# Download the raw emoji CSV from S3 (the recorded run reports data/emojis.csv).
tweet_input_obj.download_data_from_s3()

# Read all tweets into memory.
all_tweets = tweet_input_obj.read_data()

# Only the last expression of a cell is displayed, so no bare `.head()` here;
# the preview lives in its own cell. Report size and container type instead.
print("all_tweets shape: ", all_tweets.shape)
print(type(all_tweets))

Downloading from S3 is done in data/emojis.csv
Loading data is done!
all_tweets shape:  (806204, 2)
<class 'pandas.core.frame.DataFrame'>


<h2>Look into tweet data</h2>

In [4]:
# Preview the first rows of the raw frame (columns: content, sentiment).
# NOTE(review): in the recorded output, row 0 holds the literal values "text"/"emoji",
# which looks like the CSV header was read as a data row — confirm in TweetInput.read_data().
all_tweets.head(15)

Unnamed: 0,content,sentiment
0,text,emoji
1,@ATLHawks: Chance The Rapper or Kent Bazemore?...,🤔
2,"@nice_aju: Yup we love you, you're so precious...",💙
3,Fav Sing Me to Sleep by Alan Walker,💛
4,@AshBenzo: Wife From The Real-Life 'Fault In O...,💔
5,Why am I up so late,😔
6,Puppy,🙆
7,@AKGirlKuku: I need a hug. No No! I need yo hu...,❤
8,@wearegoad: me with my pet,🐕
9,Let's have some fun 8==D O Find me here,😘


In [5]:
# Drop exact duplicate rows, then count the distinct tweet texts.
# drop_duplicates() with no arguments compares all columns and returns a NEW frame,
# so `all_tweets_df` keeps the raw rows and `all_tweets_cleaned_df` is a separate,
# de-duplicated object (no aliasing, no in-place mutation; the cell is safe to re-run).
all_tweets_df = pd.DataFrame(all_tweets)
all_tweets_cleaned_df = all_tweets_df.drop_duplicates()
all_tweets_cleaned_df['content'].nunique()

600223

In [6]:
# Emoji label statistics on the de-duplicated tweets.
num_emoticons = all_tweets_cleaned_df['sentiment'].nunique()
# The recorded run reports 990 distinct labels.
print("The number of unique emoticon: {}".format(num_emoticons))
# The heading states the same number of rows that is displayed below (10).
print("Top 10 emoticons: ")
# value_counts() sorts by descending frequency, so head(10) is the 10 most common labels.
all_tweets_cleaned_df['sentiment'].value_counts().head(10)

The number of unique emoticon: 990
Top 20 emoticons: 


😂    91590
❤    32899
😭    27890
😍    25531
🙄    16406
😊    14880
😩    13905
🤔    13488
💕    10006
🔥     9419
Name: sentiment, dtype: int64

<h2>Count the number of words per tweet</h2>

In [7]:
import pandas as pd

def num_words_sentence(sentence):
    """Return how many whitespace-separated words `sentence` contains.

    Splitting uses `str.split()` with no separator, so runs of spaces, tabs
    and newlines count as a single delimiter and an empty string yields 0.
    """
    return len(sentence.split())
    
def store_num_words(texts):
    """Return a list holding the word count of every item in `texts`.

    The result has one entry per input item, in iteration order; each count
    comes from `num_words_sentence`.
    """
    return [num_words_sentence(entry) for entry in texts]
        
# Word count of every de-duplicated tweet, in row order.
num_tokens_data = store_num_words(all_tweets_cleaned_df['content'])

# One-column frame so pandas can summarise the distribution.
df_num_tokens = pd.DataFrame({'num_tokens': num_tokens_data})

print("Statistics of the number of words per tweet: ")
df_num_tokens.describe()


Statistics of the number of words per tweet: 


Unnamed: 0,num_tokens
count,613060.0
mean,9.789479
std,6.070048
min,1.0
25%,5.0
50%,8.0
75%,13.0
max,54.0


<h2>Parameters</h2>
<p> 

In [8]:
# Label-count threshold handed to filter_less_label() in the next code cell.
# NOTE(review): presumably labels with fewer tweets than this are dropped — confirm in TweetInput.
# The earlier comment mentioned 11800, but the value in use is 8500; with 8500 the recorded
# run ends up with 10 distinct labels.
num_threshold_lables = 8500
# Record count handed to filter_less_label().
# NOTE(review): presumably the number of tweets to use — confirm against TweetInput.
num_use_records = 557832

<h2>Make Two files </h2>
<p>이모티콘의 레이블을 10개로 한정함. (The emoticon labels are limited to 10.)</p>

In [9]:

# Filter the de-duplicated tweets with the two parameters defined above.
# NOTE(review): filter_less_label() lives in TweetInput (not shown here); presumably it limits the
# input to num_use_records rows and drops labels rarer than num_threshold_lables — confirm.
tweets = tweet_input_obj.filter_less_label(all_tweets_cleaned_df,num_use_records, num_threshold_lables)

print("tweets filterd shape: ", tweets.shape)
print(tweets.nunique())

# Build the emoji -> index mapping; the recorded output reports data/emoji_to_idx.pickle is saved.
tweet_input_obj.make_sentimet_label()
# Texts and their labels, taken from the helper's internal state (these calls take no frame argument).
texts, labels = tweet_input_obj.make_texts_lables()
# 0.5 is the split argument; the recorded run yields two near-equal halves (116038 / 116039 rows).
# The "train"/"test" names are reused below as the contents of tweet_file_01 and tweet_file_02.
train_text, train_label, test_text, test_label = tweet_input_obj.split_train_test_data(texts, labels, 0.5)



tweets filterd shape:  (232077, 2)
content      230688
sentiment        10
dtype: int64
data/emoji_to_idx.pickle is saved


## Make two input files

In [10]:
# Column names shared by both output files.
DATA_COLUMN = 'TWEET'
LABEL_COLUMN = 'LABEL'

# Build one frame per half of the split: first half -> file 01, second half -> file 02.
tweet_file_01_df, tweet_file_02_df = (
    pd.DataFrame({DATA_COLUMN: half_texts, LABEL_COLUMN: half_labels})
    for half_texts, half_labels in ((train_text, train_label), (test_text, test_label))
)

In [11]:
# DataFrame.info() writes its summary to stdout itself and returns None, so wrapping it
# in print() only appends a stray "None" line after each summary. Call it directly.
tweet_file_01_df.info()
tweet_file_02_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116038 entries, 0 to 116037
Data columns (total 2 columns):
TWEET    116038 non-null object
LABEL    116038 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116039 entries, 0 to 116038
Data columns (total 2 columns):
TWEET    116039 non-null object
LABEL    116039 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.8+ MB
None


In [12]:
# Output directory (relative to the working directory) and file names for the two halves.
save_split_data_dir = 'data/split'
tweet_file_01_file = "tweet_file_01.csv"
tweet_file_02_file = "tweet_file_02.csv"
# Persist each frame through the project helper; the recorded output reports
# "data/split/<file name> is saved" for both calls.
tweet_input_obj.save_input_data(save_split_data_dir, tweet_file_01_file , tweet_file_01_df)
tweet_input_obj.save_input_data(save_split_data_dir, tweet_file_02_file , tweet_file_02_df)


data/split/tweet_file_01.csv is saved
data/split/tweet_file_02.csv is saved


In [13]:
# Full relative paths of the two saved CSVs (directory + file name), used by the gzip step.
tweet_file_01_file_path, tweet_file_02_file_path = (
    os.path.join(save_split_data_dir, csv_name)
    for csv_name in (tweet_file_01_file, tweet_file_02_file)
)
print("tweet_file_01_file_path: ", tweet_file_01_file_path)
print("tweet_file_02_file_path: ", tweet_file_02_file_path)

tweet_file_01_file_path:  data/split/tweet_file_01.csv
tweet_file_02_file_path:  data/split/tweet_file_02.csv


In [14]:
# Compress both CSVs in place. gzip replaces each file with <name>.gz (-f overwrites an
# existing .gz), so after this cell the *_file_path variables no longer point at existing
# files; re-run the save cell before running this one again.
! gzip -f {tweet_file_01_file_path}
! gzip -f {tweet_file_02_file_path}

# Upload dataset to a Private S3 Bucket in our Account

In [15]:
import boto3
import sagemaker

# Resolve the AWS region from the default boto3 session.
# NOTE(review): region_name is not referenced again in the visible cells.
session = boto3.session.Session()
region_name = session.region_name

# Get the SageMaker session and its default S3 bucket; the bucket name is used to
# build the upload destination.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

In [16]:
# Destination prefix inside the default SageMaker bucket.
s3_destination_path_csv = 's3://{}/tweet_emoticon/csv'.format(bucket)
print(s3_destination_path_csv)
# Upload the whole split directory recursively; this covers both .csv.gz files, so no
# per-file copy is needed. Note that any other file left in that directory is uploaded too.
!aws s3 cp  $save_split_data_dir $s3_destination_path_csv/ --recursive

s3://sagemaker-us-west-2-057716757052/tweet_emoticon/csv
upload: data/split/tweet_file_02.csv.gz to s3://sagemaker-us-west-2-057716757052/tweet_emoticon/csv/tweet_file_02.csv.gz
upload: data/split/tweet_file_01.csv.gz to s3://sagemaker-us-west-2-057716757052/tweet_emoticon/csv/tweet_file_01.csv.gz


In [17]:
# List the destination prefix to verify that both compressed files arrived.
! aws s3 ls {s3_destination_path_csv}/

2020-06-28 05:07:16    3008478 tweet_file_01.csv.gz
2020-06-28 05:07:16    3026906 tweet_file_02.csv.gz


In [18]:
# Persist both values with IPython's %store so another notebook can restore them with `%store -r`.
%store s3_destination_path_csv
%store save_split_data_dir

Stored 's3_destination_path_csv' (str)
Stored 'save_split_data_dir' (str)
