# Processing TikTok Post Data

In [21]:
# ! pip install pandas
# ! pip install matplotlib
# ! pip install gensim

In [22]:
# ! pip3 install wordsegment
# import sys
# print(sys.executable)
# !/usr/local/bin/python3 -m pip install wordsegment 
# !/usr/local/bin/python3 -m pip install gensim

In [23]:
import pandas as pd
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import ast
import numpy as np
from PIL import Image
import re
from wordsegment import load, segment
from gensim.models import Word2Vec


In [24]:
df = pd.read_csv('users_vids_short.csv')
# df = pd.read_csv('sug_users_vids1.csv')
# df = pd.read_csv('sug_users_vids_all.csv')

In [25]:
df.head()

Unnamed: 0,id,create_time,user_name,hashtags,song,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids
0,6892428462015958273,1604768557,john.cena10,"['johncena', 'love', 'tiktok', 'fyp', 'foryoup...",الصوت الأصلي,8,1984,3,18,12800,1000000,4700000,211
1,6891790235336822018,1604619960,john.cena10,"['johncena', 'love', 'tiktok', 'halloween', 'q...",الصوت الأصلي,6,7372,9,51,52800,1000000,4700000,211
2,6891264678832475393,1604497592,john.cena10,"['johncena', 'love', 'fyp', 'foryoupage']",The Time Is Now (John Cena),5,4623,11,27,37700,1000000,4700000,211
3,6891050048403049730,1604447622,john.cena10,"['johncena', 'fyp', 'foryoupage', 'viral', 'co...",الصوت الأصلي,6,7931,6,24,51200,1000000,4700000,211
4,6890886086613126402,1604409445,john.cena10,"['johncena', 'foryoupage', 'fyp', 'viral']",الصوت الأصلي,15,3229,9,14,24700,1000000,4700000,211


#### Remove Unnecessary Data 

In [26]:
df.drop(['id', 'user_name', 'song'], axis=1, inplace=True)
df.head()

Unnamed: 0,create_time,hashtags,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids
0,1604768557,"['johncena', 'love', 'tiktok', 'fyp', 'foryoup...",8,1984,3,18,12800,1000000,4700000,211
1,1604619960,"['johncena', 'love', 'tiktok', 'halloween', 'q...",6,7372,9,51,52800,1000000,4700000,211
2,1604497592,"['johncena', 'love', 'fyp', 'foryoupage']",5,4623,11,27,37700,1000000,4700000,211
3,1604447622,"['johncena', 'fyp', 'foryoupage', 'viral', 'co...",6,7931,6,24,51200,1000000,4700000,211
4,1604409445,"['johncena', 'foryoupage', 'fyp', 'viral']",15,3229,9,14,24700,1000000,4700000,211


### Processing Time

In [27]:
df['datetime'] = pd.to_datetime(df['create_time'], unit='s')
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['weekday'] = df['datetime'].dt.weekday
df['hour'] = df['datetime'].dt.hour
df.head()

Unnamed: 0,create_time,hashtags,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids,datetime,year,month,day,weekday,hour
0,1604768557,"['johncena', 'love', 'tiktok', 'fyp', 'foryoup...",8,1984,3,18,12800,1000000,4700000,211,2020-11-07 17:02:37,2020,11,7,5,17
1,1604619960,"['johncena', 'love', 'tiktok', 'halloween', 'q...",6,7372,9,51,52800,1000000,4700000,211,2020-11-05 23:46:00,2020,11,5,3,23
2,1604497592,"['johncena', 'love', 'fyp', 'foryoupage']",5,4623,11,27,37700,1000000,4700000,211,2020-11-04 13:46:32,2020,11,4,2,13
3,1604447622,"['johncena', 'fyp', 'foryoupage', 'viral', 'co...",6,7931,6,24,51200,1000000,4700000,211,2020-11-03 23:53:42,2020,11,3,1,23
4,1604409445,"['johncena', 'foryoupage', 'fyp', 'viral']",15,3229,9,14,24700,1000000,4700000,211,2020-11-03 13:17:25,2020,11,3,1,13


##### Cyclical Encoding

In [28]:
#assume each month is approx 31 days
df['day_sin'] = np.sin(2 * np.pi * df['day'] / 31)
df['day_cos'] = np.cos(2 * np.pi * df['day'] / 31)

df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

df['weekday_sin'] = np.sin(2 * np.pi * df['weekday']/7)
df['weekday_cos'] = np.cos(2 * np.pi * df['weekday']/7)

df.head()

Unnamed: 0,create_time,hashtags,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids,...,weekday,hour,day_sin,day_cos,month_sin,month_cos,hour_sin,hour_cos,weekday_sin,weekday_cos
0,1604768557,"['johncena', 'love', 'tiktok', 'fyp', 'foryoup...",8,1984,3,18,12800,1000000,4700000,211,...,5,17,0.988468,0.151428,-0.5,0.866025,-0.965926,-0.258819,-0.974928,-0.222521
1,1604619960,"['johncena', 'love', 'tiktok', 'halloween', 'q...",6,7372,9,51,52800,1000000,4700000,211,...,3,23,0.848644,0.528964,-0.5,0.866025,-0.258819,0.965926,0.433884,-0.900969
2,1604497592,"['johncena', 'love', 'fyp', 'foryoupage']",5,4623,11,27,37700,1000000,4700000,211,...,2,13,0.724793,0.688967,-0.5,0.866025,-0.258819,-0.965926,0.974928,-0.222521
3,1604447622,"['johncena', 'fyp', 'foryoupage', 'viral', 'co...",6,7931,6,24,51200,1000000,4700000,211,...,1,23,0.571268,0.820763,-0.5,0.866025,-0.258819,0.965926,0.781831,0.62349
4,1604409445,"['johncena', 'foryoupage', 'fyp', 'viral']",15,3229,9,14,24700,1000000,4700000,211,...,1,13,0.571268,0.820763,-0.5,0.866025,-0.258819,-0.965926,0.781831,0.62349


##### Removing Unnecessary Time Data

In [29]:
df.drop(['weekday', 'hour', 'month', 'day', 'datetime', 'create_time'], axis=1, inplace=True)
df.head()

Unnamed: 0,hashtags,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids,year,day_sin,day_cos,month_sin,month_cos,hour_sin,hour_cos,weekday_sin,weekday_cos
0,"['johncena', 'love', 'tiktok', 'fyp', 'foryoup...",8,1984,3,18,12800,1000000,4700000,211,2020,0.988468,0.151428,-0.5,0.866025,-0.965926,-0.258819,-0.974928,-0.222521
1,"['johncena', 'love', 'tiktok', 'halloween', 'q...",6,7372,9,51,52800,1000000,4700000,211,2020,0.848644,0.528964,-0.5,0.866025,-0.258819,0.965926,0.433884,-0.900969
2,"['johncena', 'love', 'fyp', 'foryoupage']",5,4623,11,27,37700,1000000,4700000,211,2020,0.724793,0.688967,-0.5,0.866025,-0.258819,-0.965926,0.974928,-0.222521
3,"['johncena', 'fyp', 'foryoupage', 'viral', 'co...",6,7931,6,24,51200,1000000,4700000,211,2020,0.571268,0.820763,-0.5,0.866025,-0.258819,0.965926,0.781831,0.62349
4,"['johncena', 'foryoupage', 'fyp', 'viral']",15,3229,9,14,24700,1000000,4700000,211,2020,0.571268,0.820763,-0.5,0.866025,-0.258819,-0.965926,0.781831,0.62349


### Processing Hashtags

#### Handle NaN Data

In [30]:
# Convert hashtags to string 

df['hashtags'] = df['hashtags'].apply(ast.literal_eval)
df['hashtags'] = df['hashtags'].apply(lambda x: x if x != [] else ["nohashtag"])

def processHashtags(hashtags):
    for i in range(len(hashtags)):
        if not hashtags[i].strip(): 
            hashtags[i] = 'nohashtag'
    return hashtags

df['hashtags'] = df['hashtags'].apply(processHashtags)

df.head()

Unnamed: 0,hashtags,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids,year,day_sin,day_cos,month_sin,month_cos,hour_sin,hour_cos,weekday_sin,weekday_cos
0,"[johncena, love, tiktok, fyp, foryoupage, vibes]",8,1984,3,18,12800,1000000,4700000,211,2020,0.988468,0.151428,-0.5,0.866025,-0.965926,-0.258819,-0.974928,-0.222521
1,"[johncena, love, tiktok, halloween, queen, rob...",6,7372,9,51,52800,1000000,4700000,211,2020,0.848644,0.528964,-0.5,0.866025,-0.258819,0.965926,0.433884,-0.900969
2,"[johncena, love, fyp, foryoupage]",5,4623,11,27,37700,1000000,4700000,211,2020,0.724793,0.688967,-0.5,0.866025,-0.258819,-0.965926,0.974928,-0.222521
3,"[johncena, fyp, foryoupage, viral, comedy, cha...",6,7931,6,24,51200,1000000,4700000,211,2020,0.571268,0.820763,-0.5,0.866025,-0.258819,0.965926,0.781831,0.62349
4,"[johncena, foryoupage, fyp, viral]",15,3229,9,14,24700,1000000,4700000,211,2020,0.571268,0.820763,-0.5,0.866025,-0.258819,-0.965926,0.781831,0.62349


In [31]:
explodedDf = df['hashtags'].explode()

#### Remove Special Characters

In [32]:
def removeSpecialCharacters(hashtags):
    processedHashtag = [re.sub(r'[^a-zA-Z0-9]', '', hashtag) for hashtag in hashtags]
    processedHashtag = [hashtag.lower() for hashtag in processedHashtag]
    return processedHashtag

In [33]:
df['hashtags'] = df['hashtags'].apply(removeSpecialCharacters)
df.head()

Unnamed: 0,hashtags,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids,year,day_sin,day_cos,month_sin,month_cos,hour_sin,hour_cos,weekday_sin,weekday_cos
0,"[johncena, love, tiktok, fyp, foryoupage, vibes]",8,1984,3,18,12800,1000000,4700000,211,2020,0.988468,0.151428,-0.5,0.866025,-0.965926,-0.258819,-0.974928,-0.222521
1,"[johncena, love, tiktok, halloween, queen, rob...",6,7372,9,51,52800,1000000,4700000,211,2020,0.848644,0.528964,-0.5,0.866025,-0.258819,0.965926,0.433884,-0.900969
2,"[johncena, love, fyp, foryoupage]",5,4623,11,27,37700,1000000,4700000,211,2020,0.724793,0.688967,-0.5,0.866025,-0.258819,-0.965926,0.974928,-0.222521
3,"[johncena, fyp, foryoupage, viral, comedy, cha...",6,7931,6,24,51200,1000000,4700000,211,2020,0.571268,0.820763,-0.5,0.866025,-0.258819,0.965926,0.781831,0.62349
4,"[johncena, foryoupage, fyp, viral]",15,3229,9,14,24700,1000000,4700000,211,2020,0.571268,0.820763,-0.5,0.866025,-0.258819,-0.965926,0.781831,0.62349


In [34]:
explodedDf.value_counts()

nohashtag              56
fyp                    26
foryoupage             24
johncena               24
viral                  19
wwe                    11
comedy                 11
tiktok                  7
duet                    4
john_cena               4
viral_video             4
love                    3
foryou                  3
avengers                2
🖐🏻                      2
ad                      2
pubg                    2
jungkook                2
stark                   2
vibes                   2
rdj                     2
surrender               2
antidote                1
foru                    1
HandShaq                1
GoFORTO                 1
bts                     1
TeamSHAQ                1
😂😂😂                     1
ohnanana                1
retirementdance         1
players                 1
i                       1
ChasingHappiness        1
spiderman               1
tiptoechallenge         1
misterstark             1
delayed                 1
xbox        

#### Handling Similar Hashtags

In [35]:
def processSimilarHashtags(hashtags):
    similarHashtags = ['fypforyoupage', 'fyp', 'foryou', 'foru', 'foruyou', 'fouryou', 'foryourpage', 'foryouu', 'foryoupag', '4you', 'fypforyou', 'foryoupagee', 'fouryoupage', 'fypforyourpage', 'foreyou', 'foreyoupage', 'foryour', 'foyou', 'fouyoupage' 'foryourpag', '4youpage', 'fypforyoupa', 'foryoupqge', 'fouryoupag', 'foryoupge', 'foyoupage', 'foryouoage', 'foryoyoupage', 'foreyourpage', 'foryourpages', 'fy']
    
    for i in range(len(hashtags)):
        
        if hashtags[i] in similarHashtags:
            hashtags[i] = 'foryoupage'
        
        if not hashtags[i].strip():
            hashtags[i] = 'nohashtag'
    return hashtags

df['hashtags'] = df['hashtags'].apply(processSimilarHashtags)


In [36]:
explodedDf = df['hashtags'].explode()
counts = explodedDf.value_counts()
counts.to_csv('hashtag_counts.csv')
print(counts)

nohashtag              59
foryoupage             54
johncena               28
viral                  19
comedy                 11
wwe                    11
tiktok                  7
viralvideo              4
duet                    4
love                    3
stark                   2
ad                      2
rdj                     2
players                 2
surrender               2
pubg                    2
avengers                2
jungkook                2
vibes                   2
i                       1
goforto                 1
robertdowneyjr          1
goldbond                1
1m                      1
ohnanana                1
tiptoechallenge         1
quarantinemoustache     1
bts                     1
retirementdance         1
handshaq                1
antidote                1
spiderman               1
xbox                    1
teamshaq                1
fortopartner            1
roblox                  1
water                   1
summer                  1
robertdowney

#### Split Words in Hashtag

In [37]:
def splitHashtagByWords(hashtags):
    wordsToIgnore = ['tiktok']
    load()
    splitHashtags = []
    for hashtag in hashtags:
        if hashtag in wordsToIgnore:
             splitHashtags.append([hashtag])
        else:    
            splitHashtags.append(segment(hashtag))
    return splitHashtags

splitHashtags = df['hashtags'].apply(splitHashtagByWords)

processedHashtag = []
for hashtag in splitHashtags:
    for tag in hashtag:
        processedHashtag.append(tag)



df.head()

Unnamed: 0,hashtags,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids,year,day_sin,day_cos,month_sin,month_cos,hour_sin,hour_cos,weekday_sin,weekday_cos
0,"[johncena, love, tiktok, foryoupage, foryoupag...",8,1984,3,18,12800,1000000,4700000,211,2020,0.988468,0.151428,-0.5,0.866025,-0.965926,-0.258819,-0.974928,-0.222521
1,"[johncena, love, tiktok, halloween, queen, rob...",6,7372,9,51,52800,1000000,4700000,211,2020,0.848644,0.528964,-0.5,0.866025,-0.258819,0.965926,0.433884,-0.900969
2,"[johncena, love, foryoupage, foryoupage]",5,4623,11,27,37700,1000000,4700000,211,2020,0.724793,0.688967,-0.5,0.866025,-0.258819,-0.965926,0.974928,-0.222521
3,"[johncena, foryoupage, foryoupage, viral, come...",6,7931,6,24,51200,1000000,4700000,211,2020,0.571268,0.820763,-0.5,0.866025,-0.258819,0.965926,0.781831,0.62349
4,"[johncena, foryoupage, foryoupage, viral]",15,3229,9,14,24700,1000000,4700000,211,2020,0.571268,0.820763,-0.5,0.866025,-0.258819,-0.965926,0.781831,0.62349


#### Encode Hashtags using Word2Vec

In [38]:
model = Word2Vec(sentences=processedHashtag, vector_size=200, window=5, min_count=1, sg=1, workers=4)

model.save("hashtags_word2vec.model")

# Load the model
# model = Word2Vec.load("hashtags_word2vec.model")

# Get vector for a hashtag
# vector = model.wv['tiktok']  # Assuming the a word from a hashtag
# print(vector)

# If you need to encode a whole hashtag or sentence, average the vectors of the constituent words
def encode_hashtag(hashtag):
    # words = segment(hashtag.strip('#'))
    vector = sum(model.wv[word] for word in hashtag if word in model.wv) / len(hashtag)
    return vector

def encodeHashtag(hashtag):
    
    if any(word in model.wv.key_to_index for word in hashtag):
        vector = sum(model.wv[word] for word in hashtag if word in model.wv) / len(hashtag)
        return vector
    else:
        # Return a zero vector if none of the words are in the vocabulary
        return np.zeros(model.vector_size)

def encodeHashtagArray(hashtags):
    vectors = []
    for hashtag in hashtags:
        vector = encodeHashtag(hashtag)
        vectors.append(vector)
    
    if vectors:
        return np.mean(vectors, axis=0)
    else: 
        return np.zeros(model.vector_size)
    
df['vectors'] = df['hashtags'].apply(encodeHashtagArray)
df.head()

# encoded_vector = encode_hashtag('#DigitalMarketing')


Unnamed: 0,hashtags,video_length,n_likes,n_shares,n_comments,n_plays,n_followers,n_total_likes,n_total_vids,year,day_sin,day_cos,month_sin,month_cos,hour_sin,hour_cos,weekday_sin,weekday_cos,vectors
0,"[johncena, love, tiktok, foryoupage, foryoupag...",8,1984,3,18,12800,1000000,4700000,211,2020,0.988468,0.151428,-0.5,0.866025,-0.965926,-0.258819,-0.974928,-0.222521,"[-0.0003952271945308894, 0.0002000262296254125..."
1,"[johncena, love, tiktok, halloween, queen, rob...",6,7372,9,51,52800,1000000,4700000,211,2020,0.848644,0.528964,-0.5,0.866025,-0.258819,0.965926,0.433884,-0.900969,"[-0.0001609670725883916, 0.0001608782113180495..."
2,"[johncena, love, foryoupage, foryoupage]",5,4623,11,27,37700,1000000,4700000,211,2020,0.724793,0.688967,-0.5,0.866025,-0.258819,-0.965926,0.974928,-0.222521,"[-0.0002051636838587001, 3.6838573578279465e-0..."
3,"[johncena, foryoupage, foryoupage, viral, come...",6,7931,6,24,51200,1000000,4700000,211,2020,0.571268,0.820763,-0.5,0.866025,-0.258819,0.965926,0.781831,0.62349,"[-0.0009351924693744097, 0.0006273825815047271..."
4,"[johncena, foryoupage, foryoupage, viral]",15,3229,9,14,24700,1000000,4700000,211,2020,0.571268,0.820763,-0.5,0.866025,-0.258819,-0.965926,0.781831,0.62349,"[-0.0004166239232290536, 0.0001653323088248726..."
