In [1]:
import pandas as pd

***Load Data***

In [2]:
df = pd.read_csv("../dataCollection/TikTok/top-250-tiktokers.csv")
df

Unnamed: 0,Rank,Username,Country,Followers,Views,Likes,Engagement,Brand Account,Gender,Age,Ethnicity,Famous,Genre,LGBTQ
0,1,@charlidamelio,U.S.A,78.9m,38.3m,6.1m,16.60%,0,Female,16.0,White,0.0,"Dancing, Lipsyncing, Lifestyle",0.0
1,2,@addisonre,U.S.A,53.7m,13.8m,2.8m,20.80%,0,Female,19.0,White,0.0,"Dancing, Lipsyncing",0.0
2,3,@zachking,U.S.A,47.3m,17.6m,2.8m,16%,0,Male,30.0,East Asian,1.0,"Comedy, Illusion",0.0
3,4,@lorengray,U.S.A,46.3m,3.2m,622.6k,19.70%,0,Female,18.0,White,1.0,"Dancing, Lipsyncing",0.0
4,5,@tiktok,U.S.A,45.3m,9.4m,503.0k,5.60%,1,,,,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,252,@ramneeksingh1313,India,8.6m,479.1k,31.6k,6.70%,0,Male,35.0,South Asian,0.0,"Comedy, Acting",0.0
252,253,@amandacerny,U.S.A,8.6m,654.7k,95.2k,14.60%,0,Female,29.0,White,1.0,"Comedy, Lifestyle",0.0
253,254,@rahimabram,Russia,8.6m,1.6m,271.2k,17%,0,Male,22.0,White,1.0,"Lipsyncing, Promotion, Lifestyle",0.0
254,255,@johnnyorlando,U.S.A,8.6m,1.1m,245.4k,22.70%,0,Male,17.0,White,1.0,"Lipsyncing, Promotion, Lifestyle",0.0


***Data Preparation***

In this section we are looking to complete the following...

- Remove data that is not needed or biased
- Convert values into numerical values
- Handle Null/NaN values
- Seperate our data into x and y

In [3]:
df = df.drop('Rank', axis=1)
df = df.drop('LGBTQ', axis=1)
df = df.drop('Ethnicity', axis=1)
df = df.drop('Famous', axis=1)
df = df.drop('Gender', axis=1)
df = df.drop('Brand Account', axis=1)
df = df.drop('Age', axis=1)
df = df.drop('Username', axis=1)
# df = df.drop('Engagement', axis=1)
# df = df.drop('Country', axis=1)
df

Unnamed: 0,Country,Followers,Views,Likes,Engagement,Genre
0,U.S.A,78.9m,38.3m,6.1m,16.60%,"Dancing, Lipsyncing, Lifestyle"
1,U.S.A,53.7m,13.8m,2.8m,20.80%,"Dancing, Lipsyncing"
2,U.S.A,47.3m,17.6m,2.8m,16%,"Comedy, Illusion"
3,U.S.A,46.3m,3.2m,622.6k,19.70%,"Dancing, Lipsyncing"
4,U.S.A,45.3m,9.4m,503.0k,5.60%,
...,...,...,...,...,...,...
251,India,8.6m,479.1k,31.6k,6.70%,"Comedy, Acting"
252,U.S.A,8.6m,654.7k,95.2k,14.60%,"Comedy, Lifestyle"
253,Russia,8.6m,1.6m,271.2k,17%,"Lipsyncing, Promotion, Lifestyle"
254,U.S.A,8.6m,1.1m,245.4k,22.70%,"Lipsyncing, Promotion, Lifestyle"


In [4]:
df.dtypes

Country       object
Followers     object
Views         object
Likes         object
Engagement    object
Genre         object
dtype: object

In [5]:
# Turn the analytics into numbers

# Followers, Views, Like
def convert_to_number(s):
    if 'm' in s:
        return float(s.replace('m', '')) * 1e6
    elif 'k' in s:
        return float(s.replace('k', '')) * 1e3
    else:
        return float(s)
    
columns_to_convert = ['Followers', 'Views', 'Likes']
for col in columns_to_convert:
    df[col] = df[col].apply(convert_to_number)
    
# Engagement
df['Engagement'] = df['Engagement'].str.rstrip('%')

df['Engagement'] = df['Engagement'].astype('float') / 100.0
    
df

Unnamed: 0,Country,Followers,Views,Likes,Engagement,Genre
0,U.S.A,78900000.0,38300000.0,6100000.0,0.166,"Dancing, Lipsyncing, Lifestyle"
1,U.S.A,53700000.0,13800000.0,2800000.0,0.208,"Dancing, Lipsyncing"
2,U.S.A,47300000.0,17600000.0,2800000.0,0.160,"Comedy, Illusion"
3,U.S.A,46300000.0,3200000.0,622600.0,0.197,"Dancing, Lipsyncing"
4,U.S.A,45300000.0,9400000.0,503000.0,0.056,
...,...,...,...,...,...,...
251,India,8600000.0,479100.0,31600.0,0.067,"Comedy, Acting"
252,U.S.A,8600000.0,654700.0,95200.0,0.146,"Comedy, Lifestyle"
253,Russia,8600000.0,1600000.0,271200.0,0.170,"Lipsyncing, Promotion, Lifestyle"
254,U.S.A,8600000.0,1100000.0,245400.0,0.227,"Lipsyncing, Promotion, Lifestyle"


In [6]:
# Check how many column values are null
df.isna().sum()

Country       0
Followers     0
Views         0
Likes         0
Engagement    0
Genre         2
dtype: int64

In [7]:
# Drop the rows that have missing values

# df.dropna(subset=["Genre"], inplace=True)
# df.isna().sum()


# OR

# Fill missing values

df["Genre"] = df["Genre"].fillna("noGenre")

In [8]:
df.isna().sum()

Country       0
Followers     0
Views         0
Likes         0
Engagement    0
Genre         0
dtype: int64

In [9]:
df

Unnamed: 0,Country,Followers,Views,Likes,Engagement,Genre
0,U.S.A,78900000.0,38300000.0,6100000.0,0.166,"Dancing, Lipsyncing, Lifestyle"
1,U.S.A,53700000.0,13800000.0,2800000.0,0.208,"Dancing, Lipsyncing"
2,U.S.A,47300000.0,17600000.0,2800000.0,0.160,"Comedy, Illusion"
3,U.S.A,46300000.0,3200000.0,622600.0,0.197,"Dancing, Lipsyncing"
4,U.S.A,45300000.0,9400000.0,503000.0,0.056,noGenre
...,...,...,...,...,...,...
251,India,8600000.0,479100.0,31600.0,0.067,"Comedy, Acting"
252,U.S.A,8600000.0,654700.0,95200.0,0.146,"Comedy, Lifestyle"
253,Russia,8600000.0,1600000.0,271200.0,0.170,"Lipsyncing, Promotion, Lifestyle"
254,U.S.A,8600000.0,1100000.0,245400.0,0.227,"Lipsyncing, Promotion, Lifestyle"


In [10]:
# Turn genres into numerical values using hot encoding and MultiLabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer

# Split the Genre column into separate genres
df['Genre'] = df['Genre'].str.split(', ')

mlb = MultiLabelBinarizer()

genre_encoded = pd.DataFrame(mlb.fit_transform(df['Genre']), columns=mlb.classes_, index=df.index)

df = pd.concat([df.drop(columns=['Genre']), genre_encoded], axis=1)
df

Unnamed: 0,Country,Followers,Views,Likes,Engagement,Acting,Art,Comedy,Content House,DIY,...,Pets,Photography,Pranking,Pranking.1,Promotion,Reacting,Relationship,Reviews,Singing,noGenre
0,U.S.A,78900000.0,38300000.0,6100000.0,0.166,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,U.S.A,53700000.0,13800000.0,2800000.0,0.208,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,U.S.A,47300000.0,17600000.0,2800000.0,0.160,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,U.S.A,46300000.0,3200000.0,622600.0,0.197,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,U.S.A,45300000.0,9400000.0,503000.0,0.056,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,India,8600000.0,479100.0,31600.0,0.067,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
252,U.S.A,8600000.0,654700.0,95200.0,0.146,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
253,Russia,8600000.0,1600000.0,271200.0,0.170,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
254,U.S.A,8600000.0,1100000.0,245400.0,0.227,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [11]:
# Turn countries into numerical values using hot encoding
df = pd.get_dummies(df, columns=['Country'])

# Convert boolean columns to integer only for the newly created dummy columns
dummy_columns = [col for col in df.columns if 'Country_' in col]
df[dummy_columns] = df[dummy_columns].astype(int)

df

Unnamed: 0,Followers,Views,Likes,Engagement,Acting,Art,Comedy,Content House,DIY,Dancing,...,Country_Poland,Country_Russia,Country_South Africa,Country_South Korea,Country_Spain,Country_Turkey,Country_U.A.E.,Country_U.K.,Country_U.S.A,Country_Vietnam
0,78900000.0,38300000.0,6100000.0,0.166,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,53700000.0,13800000.0,2800000.0,0.208,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,47300000.0,17600000.0,2800000.0,0.160,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,46300000.0,3200000.0,622600.0,0.197,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,45300000.0,9400000.0,503000.0,0.056,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,8600000.0,479100.0,31600.0,0.067,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
252,8600000.0,654700.0,95200.0,0.146,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
253,8600000.0,1600000.0,271200.0,0.170,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
254,8600000.0,1100000.0,245400.0,0.227,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [12]:
df.dtypes

Followers                float64
Views                    float64
Likes                    float64
Engagement               float64
Acting                     int64
Art                        int64
Comedy                     int64
Content House              int64
DIY                        int64
Dancing                    int64
Dancing                    int64
Education                  int64
Family                     int64
Family                     int64
Fitness                    int64
Food                       int64
Highlights                 int64
Illusion                   int64
Lifestyle                  int64
Lipsyncing                 int64
Lipsyncing                 int64
Makeup                     int64
Media                      int64
Motivational Speaking      int64
Parkour                    int64
Pets                       int64
Photography                int64
Pranking                   int64
Pranking                   int64
Promotion                  int64
Reacting  

In [13]:
# Seperate into x and y
x = df.filter(regex='^(Followers|Views|Likes|Engagement|Country_)')
x

Unnamed: 0,Followers,Views,Likes,Engagement,Country_Aruba,Country_Australia,Country_Brazil,Country_Chile,Country_Colombia,Country_Ecuador,...,Country_Poland,Country_Russia,Country_South Africa,Country_South Korea,Country_Spain,Country_Turkey,Country_U.A.E.,Country_U.K.,Country_U.S.A,Country_Vietnam
0,78900000.0,38300000.0,6100000.0,0.166,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,53700000.0,13800000.0,2800000.0,0.208,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,47300000.0,17600000.0,2800000.0,0.160,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,46300000.0,3200000.0,622600.0,0.197,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,45300000.0,9400000.0,503000.0,0.056,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,8600000.0,479100.0,31600.0,0.067,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
252,8600000.0,654700.0,95200.0,0.146,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
253,8600000.0,1600000.0,271200.0,0.170,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
254,8600000.0,1100000.0,245400.0,0.227,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
df_copy = df.copy()
df_copy = df.drop(df.filter(regex='^Country_').columns, axis=1)
y = df_copy.drop(["Followers", "Views", "Likes", "Engagement"], axis=1)
y

Unnamed: 0,Acting,Art,Comedy,Content House,DIY,Dancing,Dancing.1,Education,Family,Family.1,...,Pets,Photography,Pranking,Pranking.1,Promotion,Reacting,Relationship,Reviews,Singing,noGenre
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
252,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
253,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
254,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
