In [1]:
!pip install matplotlib



In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [3]:
df = pd.read_csv("dataset_free-tiktok-scraper_2025-05-12_19-19-34-635.csv")

DATA PREPARATION

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 443 entries, 0 to 442
Columns: 770 entries, authorMeta/avatar to webVideoUrl
dtypes: bool(8), float64(145), int64(18), object(599)
memory usage: 2.6+ MB


In [5]:
df.columns

Index(['authorMeta/avatar', 'authorMeta/bioLink', 'authorMeta/digg',
       'authorMeta/fans', 'authorMeta/following', 'authorMeta/friends',
       'authorMeta/heart', 'authorMeta/id', 'authorMeta/name',
       'authorMeta/nickName',
       ...
       'videoMeta/subtitleLinks/64/tiktokLink',
       'videoMeta/subtitleLinks/64/version',
       'videoMeta/subtitleLinks/65/downloadLink',
       'videoMeta/subtitleLinks/65/language',
       'videoMeta/subtitleLinks/65/source',
       'videoMeta/subtitleLinks/65/sourceUnabbreviated',
       'videoMeta/subtitleLinks/65/tiktokLink',
       'videoMeta/subtitleLinks/65/version', 'videoMeta/width', 'webVideoUrl'],
      dtype='object', length=770)

In [6]:
# Merge all hashtags columns into one
hashtag_cols = [col for col in df.columns if col.startswith('hashtags/') and col.endswith('/name')]

In [7]:
df['hashtags'] = df[hashtag_cols].astype(str).agg(', '.join, axis=1)

In [8]:
# Delete hashtag columns afer merging
df.drop(columns=hashtag_cols, inplace=True)

In [9]:
# Remove columns with missing values
df_cleaned = df.dropna(axis=1)

In [10]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 443 entries, 0 to 442
Data columns (total 43 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   authorMeta/avatar                 443 non-null    object
 1   authorMeta/digg                   443 non-null    int64 
 2   authorMeta/fans                   443 non-null    int64 
 3   authorMeta/following              443 non-null    int64 
 4   authorMeta/friends                443 non-null    int64 
 5   authorMeta/heart                  443 non-null    int64 
 6   authorMeta/id                     443 non-null    int64 
 7   authorMeta/name                   443 non-null    object
 8   authorMeta/nickName               443 non-null    object
 9   authorMeta/originalAvatarUrl      443 non-null    object
 10  authorMeta/privateAccount         443 non-null    bool  
 11  authorMeta/profileUrl             443 non-null    object
 12  authorMeta/ttSeller   

In [11]:
df_cleaned['hashtags']

0      greenscreensticker, bookreview, jodipicoult, s...
1      nan, fablepartner, booktok, booklover, bookish...
2      nan, fable, bookrecommendations, bookclubtikto...
3      nan, booktracker, booktrackerapp, bookish, boo...
4      booktok, books, bookish, goodreads, storygraph...
                             ...                        
438    nan, fable, goodreads, booktracker, booktok, k...
439    nan, nan, nan, nan, nan, nan, nan, nan, nan, n...
440    bookrecs, booktok, bookrecommendations, favori...
441    nan, booktok, booktokgirlies, readers, fable, ...
442    fable, fableapp, booktok, books, nan, nan, nan...
Name: hashtags, Length: 443, dtype: object

In [12]:
# Drop irrelevant columns 
columns_to_drop = [
    'authorMeta/avatar',
    'authorMeta/digg',
    'authorMeta/following',
    'authorMeta/friends',
    'authorMeta/heart',
    'authorMeta/id',
    'authorMeta/nickName',
    'authorMeta/originalAvatarUrl',
    'authorMeta/profileUrl',
    'authorMeta/ttSeller',
    'authorMeta/video',
    'createTime',
    'id',
    'isAd',
    'isMuted',
    'isPinned',
    'isSlideshow',
    'musicMeta/coverMediumUrl',
    'musicMeta/musicAuthor',
    'musicMeta/musicId',
    'musicMeta/musicName',
    'musicMeta/musicOriginal',
    'musicMeta/originalCoverMediumUrl',
    'videoMeta/coverUrl',
    'videoMeta/definition',
    'videoMeta/format',
    'videoMeta/height',
    'videoMeta/originalCoverUrl',
    'videoMeta/width',
    'webVideoUrl'
]

In [13]:
df_cleaned = df_cleaned.drop(columns=columns_to_drop)

In [14]:
df_cleaned = df_cleaned.copy()
df_cleaned['createTimeISO'] = pd.to_datetime(df_cleaned['createTimeISO'], errors='coerce')
print(df_cleaned['createTimeISO'].dtype)

datetime64[ns, UTC]


In [15]:
# create new columns to help with analysis on DATETIME
df_cleaned.loc[:, 'year'] = df_cleaned['createTimeISO'].dt.year
df_cleaned.loc[:, 'month'] = df_cleaned['createTimeISO'].dt.month
df_cleaned.loc[:, 'day'] = df_cleaned['createTimeISO'].dt.day
df_cleaned.loc[:, 'weekday'] = df_cleaned['createTimeISO'].dt.day_name()
df_cleaned.loc[:, 'hour'] = df_cleaned['createTimeISO'].dt.hour

In [16]:
# Drop initial datetime format column
df_cleaned = df_cleaned.drop(columns='createTimeISO')

In [17]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 443 entries, 0 to 442
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   authorMeta/fans            443 non-null    int64 
 1   authorMeta/name            443 non-null    object
 2   authorMeta/privateAccount  443 non-null    bool  
 3   authorMeta/verified        443 non-null    bool  
 4   collectCount               443 non-null    int64 
 5   commentCount               443 non-null    int64 
 6   diggCount                  443 non-null    int64 
 7   playCount                  443 non-null    int64 
 8   shareCount                 443 non-null    int64 
 9   textLanguage               443 non-null    object
 10  videoMeta/duration         443 non-null    int64 
 11  hashtags                   443 non-null    object
 12  year                       443 non-null    int32 
 13  month                      443 non-null    int32 
 14  day       