# Processing TikTok Post Data

In [None]:
! pip install pandas
! pip install matplotlib

In [1]:
import pandas as pd
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import ast
import numpy as np
from PIL import Image
import re

In [28]:
df = pd.read_csv('users_vids_short.csv')
# df = pd.read_csv('sug_users_vids1.csv')
# df = pd.read_csv('sug_users_vids_all.csv')

In [29]:
df.head()

Unnamed: 0,id,create_time,user_name,hashtags,song,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids
0,6892428462015958273,1604768557,john.cena10,"['johncena', 'love', 'tiktok', 'fyp', 'foryoup...",الصوت الأصلي,8,1984,3,18,12800,1000000,4700000,211
1,6891790235336822018,1604619960,john.cena10,"['johncena', 'love', 'tiktok', 'halloween', 'q...",الصوت الأصلي,6,7372,9,51,52800,1000000,4700000,211
2,6891264678832475393,1604497592,john.cena10,"['johncena', 'love', 'fyp', 'foryoupage']",The Time Is Now (John Cena),5,4623,11,27,37700,1000000,4700000,211
3,6891050048403049730,1604447622,john.cena10,"['johncena', 'fyp', 'foryoupage', 'viral', 'co...",الصوت الأصلي,6,7931,6,24,51200,1000000,4700000,211
4,6890886086613126402,1604409445,john.cena10,"['johncena', 'foryoupage', 'fyp', 'viral']",الصوت الأصلي,15,3229,9,14,24700,1000000,4700000,211


#### Remove Unnecessary Data 

In [30]:
df.drop(['id', 'user_name', 'song'], axis=1, inplace=True)
# df.drop('user_name', axis=1, inplace=True)
# df.drop('song', axis=1, inplace=True)
# df.drop('create_time', axis=1, inplace=True)
df.head()

Unnamed: 0,create_time,hashtags,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids
0,1604768557,"['johncena', 'love', 'tiktok', 'fyp', 'foryoup...",8,1984,3,18,12800,1000000,4700000,211
1,1604619960,"['johncena', 'love', 'tiktok', 'halloween', 'q...",6,7372,9,51,52800,1000000,4700000,211
2,1604497592,"['johncena', 'love', 'fyp', 'foryoupage']",5,4623,11,27,37700,1000000,4700000,211
3,1604447622,"['johncena', 'fyp', 'foryoupage', 'viral', 'co...",6,7931,6,24,51200,1000000,4700000,211
4,1604409445,"['johncena', 'foryoupage', 'fyp', 'viral']",15,3229,9,14,24700,1000000,4700000,211


### Processing Time

In [31]:
df['datetime'] = pd.to_datetime(df['create_time'], unit='s')
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['weekday'] = df['datetime'].dt.weekday
df['hour'] = df['datetime'].dt.hour
df.head()

Unnamed: 0,create_time,hashtags,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids,datetime,year,month,day,weekday,hour
0,1604768557,"['johncena', 'love', 'tiktok', 'fyp', 'foryoup...",8,1984,3,18,12800,1000000,4700000,211,2020-11-07 17:02:37,2020,11,7,5,17
1,1604619960,"['johncena', 'love', 'tiktok', 'halloween', 'q...",6,7372,9,51,52800,1000000,4700000,211,2020-11-05 23:46:00,2020,11,5,3,23
2,1604497592,"['johncena', 'love', 'fyp', 'foryoupage']",5,4623,11,27,37700,1000000,4700000,211,2020-11-04 13:46:32,2020,11,4,2,13
3,1604447622,"['johncena', 'fyp', 'foryoupage', 'viral', 'co...",6,7931,6,24,51200,1000000,4700000,211,2020-11-03 23:53:42,2020,11,3,1,23
4,1604409445,"['johncena', 'foryoupage', 'fyp', 'viral']",15,3229,9,14,24700,1000000,4700000,211,2020-11-03 13:17:25,2020,11,3,1,13


##### Cyclical Encoding

In [32]:
#assume each month is approx 31 days
df['day_sin'] = np.sin(2 * np.pi * df['day'] / 31)
df['day_cos'] = np.cos(2 * np.pi * df['day'] / 31)

df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

df['weekday_sin'] = np.sin(2 * np.pi * df['weekday']/7)
df['weekday_cos'] = np.cos(2 * np.pi * df['weekday']/7)

df.head()

Unnamed: 0,create_time,hashtags,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids,...,weekday,hour,day_sin,day_cos,month_sin,month_cos,hour_sin,hour_cos,weekday_sin,weekday_cos
0,1604768557,"['johncena', 'love', 'tiktok', 'fyp', 'foryoup...",8,1984,3,18,12800,1000000,4700000,211,...,5,17,0.988468,0.151428,-0.5,0.866025,-0.965926,-0.258819,-0.974928,-0.222521
1,1604619960,"['johncena', 'love', 'tiktok', 'halloween', 'q...",6,7372,9,51,52800,1000000,4700000,211,...,3,23,0.848644,0.528964,-0.5,0.866025,-0.258819,0.965926,0.433884,-0.900969
2,1604497592,"['johncena', 'love', 'fyp', 'foryoupage']",5,4623,11,27,37700,1000000,4700000,211,...,2,13,0.724793,0.688967,-0.5,0.866025,-0.258819,-0.965926,0.974928,-0.222521
3,1604447622,"['johncena', 'fyp', 'foryoupage', 'viral', 'co...",6,7931,6,24,51200,1000000,4700000,211,...,1,23,0.571268,0.820763,-0.5,0.866025,-0.258819,0.965926,0.781831,0.62349
4,1604409445,"['johncena', 'foryoupage', 'fyp', 'viral']",15,3229,9,14,24700,1000000,4700000,211,...,1,13,0.571268,0.820763,-0.5,0.866025,-0.258819,-0.965926,0.781831,0.62349


##### Removing Unnecessary Time Data

In [33]:
df.drop(['weekday', 'hour', 'month', 'day', 'datetime', 'create_time'], axis=1, inplace=True)
df.head()

Unnamed: 0,hashtags,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids,year,day_sin,day_cos,month_sin,month_cos,hour_sin,hour_cos,weekday_sin,weekday_cos
0,"['johncena', 'love', 'tiktok', 'fyp', 'foryoup...",8,1984,3,18,12800,1000000,4700000,211,2020,0.988468,0.151428,-0.5,0.866025,-0.965926,-0.258819,-0.974928,-0.222521
1,"['johncena', 'love', 'tiktok', 'halloween', 'q...",6,7372,9,51,52800,1000000,4700000,211,2020,0.848644,0.528964,-0.5,0.866025,-0.258819,0.965926,0.433884,-0.900969
2,"['johncena', 'love', 'fyp', 'foryoupage']",5,4623,11,27,37700,1000000,4700000,211,2020,0.724793,0.688967,-0.5,0.866025,-0.258819,-0.965926,0.974928,-0.222521
3,"['johncena', 'fyp', 'foryoupage', 'viral', 'co...",6,7931,6,24,51200,1000000,4700000,211,2020,0.571268,0.820763,-0.5,0.866025,-0.258819,0.965926,0.781831,0.62349
4,"['johncena', 'foryoupage', 'fyp', 'viral']",15,3229,9,14,24700,1000000,4700000,211,2020,0.571268,0.820763,-0.5,0.866025,-0.258819,-0.965926,0.781831,0.62349


### Processing Hashtags

#### Handle NaN Data

In [26]:
# Convert hashtags to string 
df['hashtags'] = df['hashtags'].apply(ast.literal_eval)
df['hashtags'] = df['hashtags'].apply(lambda x: x if x != [] else ["NoHashtag"])

print(df)

    create_time    user_name  \
0    1604768557  john.cena10   
1    1604619960  john.cena10   
2    1604497592  john.cena10   
3    1604447622  john.cena10   
4    1604409445  john.cena10   
..          ...          ...   
95   1596312708      mrbeast   
96   1596056570      mrbeast   
97   1595791388      mrbeast   
98   1595353928      mrbeast   
99   1595181506      mrbeast   

                                             hashtags  \
0    [johncena, love, tiktok, fyp, foryoupage, vibes]   
1   [johncena, love, tiktok, halloween, queen, rob...   
2                   [johncena, love, fyp, foryoupage]   
3   [johncena, fyp, foryoupage, viral, comedy, cha...   
4                  [johncena, foryoupage, fyp, viral]   
..                                                ...   
95                                        [NoHashtag]   
96                                        [NoHashtag]   
97                                        [NoHashtag]   
98                                        [No

In [27]:
explodedDf = df['hashtags'].explode()

#### Remove Special Characters and Whitespace

In [28]:
def removeSpecialCharacters(hashtags):
    processedHashtag = [re.sub(r'[^a-zA-Z0-9]', '', hashtag) for hashtag in hashtags]
    return processedHashtag

In [29]:
df['hashtags'] = df['hashtags'].apply(removeSpecialCharacters)
df.head()

Unnamed: 0,create_time,user_name,hashtags,song,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids
0,1604768557,john.cena10,"[johncena, love, tiktok, fyp, foryoupage, vibes]",الصوت الأصلي,8,1984,3,18,12800,1000000,4700000,211
1,1604619960,john.cena10,"[johncena, love, tiktok, halloween, queen, rob...",الصوت الأصلي,6,7372,9,51,52800,1000000,4700000,211
2,1604497592,john.cena10,"[johncena, love, fyp, foryoupage]",The Time Is Now (John Cena),5,4623,11,27,37700,1000000,4700000,211
3,1604447622,john.cena10,"[johncena, fyp, foryoupage, viral, comedy, cha...",الصوت الأصلي,6,7931,6,24,51200,1000000,4700000,211
4,1604409445,john.cena10,"[johncena, foryoupage, fyp, viral]",الصوت الأصلي,15,3229,9,14,24700,1000000,4700000,211


#### Handling Similar Hashtags

#### 

#### Word to Vec