In [191]:
import pandas as pd

***Load Data***

In [192]:
# Read the TikTok creators CSV into a DataFrame; the path is relative, so it
# resolves against the directory the notebook kernel is running in.
df = pd.read_csv("../dataCollection/TikTok/top_1000_tiktokers.csv")
# A bare name as the last expression renders the frame as the cell output.
df

Unnamed: 0,Rank,Username,Tiktok Link,Name,Subscribers Count,Views. Avg,Likes. Avg,Comments. Avg,Shares. Avg
0,1,badbunny,https://www.tiktok.com/@badbunny,Bad Bunny,23.4M,63.1M,10.3M,65.2K,108.6K
1,2,noahschnapp,https://www.tiktok.com/@noahschnapp,Noah Schnapp,26.1M,23.4M,4.8M,46.7K,22.1K
2,3,khaby.lame,https://www.tiktok.com/@khaby.lame,Khabane lame,145.9M,47.5M,2.5M,46K,9.4K
3,4,coopernoriega,https://www.tiktok.com/@coopernoriega,Cooper Noriega,3.2M,31M,1.3M,120.9K,8.4K
4,5,avrillavigne,https://www.tiktok.com/@avrillavigne,Avril Lavigne,4M,24.7M,3.5M,32.1K,22.7K
...,...,...,...,...,...,...,...,...,...
995,996,daraarafah24,https://www.tiktok.com/@daraarafah24,daraarafah,802.4K,2.1M,151.3K,1.7K,1K
996,997,straykids_japan,https://www.tiktok.com/@straykids_japan,straykids_japan,7.6M,718.5K,195.8K,4K,2.3K
997,998,fiersabesari,https://www.tiktok.com/@fiersabesari,Fiersa Besari,1.7M,1.1M,108.6K,395,6.9K
998,999,mrgrandeofficial,https://www.tiktok.com/@mrgrandeofficial,Mikey Angelo,2.1M,1.1M,227.4K,1.3K,2.1K


***Data Preparation***

In this section we are looking to complete the following...

- Remove data that is not needed or biased
- Convert values into numerical values
- Separate our data into x and y

In [193]:
# Discard the link, display-name and rank columns in a single call,
# then list the dtypes of the columns that remain.
df = df.drop(columns=['Tiktok Link', 'Name', 'Rank'])
df.dtypes

Username             object
Subscribers Count    object
Views. Avg           object
Likes. Avg           object
Comments. Avg        object
Shares. Avg          object
dtype: object

In [194]:
# Turn the analytics into numbers
def convert_to_number(s):
    """Convert an abbreviated count such as ``"23.4M"`` or ``"65.2K"`` to a float.

    Parameters
    ----------
    s : str or number
        A count written with an optional magnitude suffix: ``K`` (thousand),
        ``M`` (million) or ``B`` (billion), in either case. Surrounding
        whitespace and thousands separators (``,``) are ignored. Values that
        are already numeric are returned as ``float`` unchanged.

    Returns
    -------
    float
        The value scaled by the suffix's multiplier, or the plain value when
        there is no suffix.

    Raises
    ------
    ValueError
        If the numeric part of a string cannot be parsed by ``float``.
    """
    # Already-numeric values (including NaN for a missing cell) pass through, so
    # applying the conversion to an already-converted column does not crash.
    if not isinstance(s, str):
        return float(s)

    multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9}
    text = s.strip().replace(',', '')
    # Slicing (rather than indexing) keeps the empty string from raising
    # IndexError; it then falls through to float(), which raises ValueError.
    suffix = text[-1:].upper()
    if suffix in multipliers:
        return float(text[:-1]) * multipliers[suffix]
    return float(text)
    
columns_to_convert = [
    'Subscribers Count',
    'Views. Avg',
    'Likes. Avg',
    'Comments. Avg',
    'Shares. Avg',
]
# Replace each abbreviated-count column with its numeric equivalent, value by value.
for col in columns_to_convert:
    df[col] = df[col].map(convert_to_number)

df

Unnamed: 0,Username,Subscribers Count,Views. Avg,Likes. Avg,Comments. Avg,Shares. Avg
0,badbunny,23400000.0,63100000.0,10300000.0,65200.0,108600.0
1,noahschnapp,26100000.0,23400000.0,4800000.0,46700.0,22100.0
2,khaby.lame,145900000.0,47500000.0,2500000.0,46000.0,9400.0
3,coopernoriega,3200000.0,31000000.0,1300000.0,120900.0,8400.0
4,avrillavigne,4000000.0,24700000.0,3500000.0,32100.0,22700.0
...,...,...,...,...,...,...
995,daraarafah24,802400.0,2100000.0,151300.0,1700.0,1000.0
996,straykids_japan,7600000.0,718500.0,195800.0,4000.0,2300.0
997,fiersabesari,1700000.0,1100000.0,108600.0,395.0,6900.0
998,mrgrandeofficial,2100000.0,1100000.0,227400.0,1300.0,2100.0


In [195]:
# Check the column dtypes again now that the count columns have been converted.
df.dtypes

Username              object
Subscribers Count    float64
Views. Avg           float64
Likes. Avg           float64
Comments. Avg        float64
Shares. Avg          float64
dtype: object

In [196]:
# Turn the TikTok usernames into numbers
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the usernames first, then swap the column for its integer codes.
# The fitted encoder stays bound to `label_encoder`.
label_encoder = LabelEncoder()
label_encoder.fit(df['Username'])
df['Username'] = label_encoder.transform(df['Username'])

df

Unnamed: 0,Username,Subscribers Count,Views. Avg,Likes. Avg,Comments. Avg,Shares. Avg
0,100,23400000.0,63100000.0,10300000.0,65200.0,108600.0
1,704,26100000.0,23400000.0,4800000.0,46700.0,22100.0
2,511,145900000.0,47500000.0,2500000.0,46000.0,9400.0
3,198,3200000.0,31000000.0,1300000.0,120900.0,8400.0
4,92,4000000.0,24700000.0,3500000.0,32100.0,22700.0
...,...,...,...,...,...,...
995,216,802400.0,2100000.0,151300.0,1700.0,1000.0
996,864,7600000.0,718500.0,195800.0,4000.0,2300.0
997,316,1700000.0,1100000.0,108600.0,395.0,6900.0
998,665,2100000.0,1100000.0,227400.0,1300.0,2100.0


In [197]:
from sklearn.model_selection import train_test_split

# Target vector: the Username column, which at this point holds integer codes.
# NOTE(review): the codes are identifiers produced by label encoding, not a measured
# quantity, so a regressor may have nothing meaningful to learn — confirm this is
# the intended target.
y = df['Username']
y

0      100
1      704
2      511
3      198
4       92
      ... 
995    216
996    864
997    316
998    665
999    717
Name: Username, Length: 1000, dtype: int64

In [198]:
# Feature matrix: every column except the Username target.
x = df.drop(columns=['Username'])
x

Unnamed: 0,Subscribers Count,Views. Avg,Likes. Avg,Comments. Avg,Shares. Avg
0,23400000.0,63100000.0,10300000.0,65200.0,108600.0
1,26100000.0,23400000.0,4800000.0,46700.0,22100.0
2,145900000.0,47500000.0,2500000.0,46000.0,9400.0
3,3200000.0,31000000.0,1300000.0,120900.0,8400.0
4,4000000.0,24700000.0,3500000.0,32100.0,22700.0
...,...,...,...,...,...
995,802400.0,2100000.0,151300.0,1700.0,1000.0
996,7600000.0,718500.0,195800.0,4000.0,2300.0
997,1700000.0,1100000.0,108600.0,395.0,6900.0
998,2100000.0,1100000.0,227400.0,1300.0,2100.0


In [199]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=100,
)

***Training The Model***

In [200]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so that Restart & Run All reproduces the same score; without a
# random_state the forest's internal random sampling differs on every run. The seed
# value matches the one already used for the train/test split.
model = RandomForestRegressor(random_state=100)
model.fit(x_train, y_train)
# For a regressor, score() reports R^2 on the data it is given (here the held-out set).
# A negative value means the model does worse than always predicting the mean target.
model.score(x_test, y_test)

-0.09987615825226692

***Predictions***