## Load Data

In [135]:
import pandas as pd

In [136]:
# Read the first sheet of the raw workbook (sheet_name=0 is the pandas default,
# spelled out here) and show the frame's (rows, columns).
df = pd.read_excel('../dataset/main.xlsx', sheet_name=0)
df.shape

(36791, 18)

In [137]:
# Move `views` to the last column position: pop() removes the column from the
# frame and returns it, and the assignment appends it back at the end.
df['views'] = df.pop('views')

In [138]:
# Preview the first three rows to check the column layout.
df.head(n=3)

Unnamed: 0,trending_date,title,channel_title,category_id,publish_time,tags,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed,description,No_tags,desc_len,len_title,publish_date,views
0,2017-11-14,Sharry Mann: Cute Munda ( Song Teaser) | Parmi...,Lokdhun Punjabi,1,12:20:39,"sharry mann|""sharry mann new song""|""sharry man...",33966,798,882,False,False,False,Presenting Sharry Mann latest Punjabi Song Cu...,15,920,81,2017-11-12,1096327
1,2017-11-14,"पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं...",HJ NEWS,25,05:43:56,"पीरियड्स के समय|""पेट पर पति करता ऐसा""|""देखकर द...",735,904,0,True,False,False,"पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं...",19,2232,58,2017-11-13,590101
2,2017-11-14,Stylish Star Allu Arjun @ ChaySam Wedding Rece...,TFPC,24,15:48:08,Stylish Star Allu Arjun @ ChaySam Wedding Rece...,2011,243,149,False,False,False,Watch Stylish Star Allu Arjun @ ChaySam Weddin...,14,482,58,2017-11-12,473988


In [139]:
# Column labels grouped by dtype family. Columns whose dtype is in neither
# family (e.g. datetime columns) end up in neither group.
numeric_columns = df.select_dtypes(include='number').columns
categorical_columns = df.select_dtypes(
    include=['object', 'category', 'bool']
).columns

In [140]:
# Summary statistics for the numeric columns.
df.loc[:, numeric_columns].describe()

Unnamed: 0,category_id,likes,dislikes,comment_count,No_tags,desc_len,len_title,views
count,36791.0,36791.0,36791.0,36791.0,36791.0,36791.0,36791.0,36791.0
mean,21.552173,27450.69,1685.363,2714.022043,18.938463,923.079123,70.609361,1071490.0
std,6.586716,97831.29,16197.32,14978.114328,9.843531,815.038867,22.409174,3207149.0
min,1.0,0.0,0.0,0.0,1.0,3.0,5.0,4024.0
25%,23.0,879.0,109.0,83.0,12.0,368.0,53.0,125604.0
50%,24.0,3126.0,331.0,336.0,19.0,677.0,74.0,307836.0
75%,24.0,14095.0,1032.0,1314.5,25.0,1237.0,91.0,806631.5
max,43.0,2912710.0,1545017.0,827755.0,72.0,5136.0,100.0,125432200.0


In [141]:
# Pairwise Pearson correlation (the pandas default) between numeric columns.
df.loc[:, numeric_columns].corr(method='pearson')

Unnamed: 0,category_id,likes,dislikes,comment_count,No_tags,desc_len,len_title,views
category_id,1.0,-0.127137,-0.042272,-0.035696,-0.000233,-0.148119,-0.163273,-0.183014
likes,-0.127137,1.0,0.494518,0.780445,0.0196,0.065086,-0.170257,0.850073
dislikes,-0.042272,0.494518,1.0,0.7081,0.033725,0.035031,-0.0303,0.54331
comment_count,-0.035696,0.780445,0.7081,1.0,0.020227,0.031604,-0.119092,0.674151
No_tags,-0.000233,0.0196,0.033725,0.020227,1.0,0.17776,0.041871,0.055327
desc_len,-0.148119,0.065086,0.035031,0.031604,0.17776,1.0,0.159658,0.122142
len_title,-0.163273,-0.170257,-0.0303,-0.119092,0.041871,0.159658,1.0,-0.051559
views,-0.183014,0.850073,0.54331,0.674151,0.055327,0.122142,-0.051559,1.0


Kolom No_tags dan len_title memiliki korelasi yang sangat kecil terhadap views (masing-masing sekitar 0,06 dan -0,05), sehingga tidak akan memberikan pengaruh yang signifikan terhadap prediksi views. Maka dari itu, No_tags dan len_title layak untuk di-drop.

In [142]:
# Count / unique / top / freq summary for the object and boolean columns.
df.loc[:, categorical_columns].describe()

Unnamed: 0,trending_date,title,channel_title,publish_time,tags,comments_disabled,ratings_disabled,video_error_or_removed,description
count,36791,36791,36791,36791,36791,36791,36791,36791,36746
unique,205,16431,1390,12375,12463,2,2,2,13979
top,2017-12-23,Mission: Impossible - Fallout (2018) - Officia...,VikatanTV,04:30:00,[none],False,False,False,Subscribers Link: http://bit.ly/2qb69dZ\n\nCon...
freq,199,19,284,333,1120,35611,36034,36780,166


Kolom comments_disabled, ratings_disabled, dan video_error_or_removed merupakan kolom kategorikal biner yang sangat didominasi oleh satu kategori saja. Maka dari itu, kolom-kolom ini kemungkinan besar tidak memberikan dampak yang signifikan terhadap data dan layak untuk di-drop.

## Feature Engineering

### Dropping low-significance features

In [143]:
# Remove the three heavily one-sided boolean flags and the two numeric columns
# that correlate only weakly with `views`.
df = df.drop(
    ['comments_disabled', 'ratings_disabled', 'video_error_or_removed', 'No_tags', 'len_title'],
    axis='columns',
)

### Dropping text features

Kolom title, tags, dan description merupakan fitur teks yang tidak dapat diproses langsung oleh model. Maka dari itu, untuk pembuatan model kali ini, kolom-kolom ini akan di-drop.

In [144]:
# Discard the free-text columns.
df = df.drop(['title', 'tags', 'description'], axis='columns')

### Ranking Channel Title

Channel title merupakan kolom kategorikal yang harus di-preprocess menjadi numerik agar bisa dimengerti oleh model.

Terdapat 1390 jenis channel title sehingga one-hot encoding tidak bisa dilakukan karena akan membuat dimensi fitur menjadi sangat besar.

Oleh karena itu feature engineering akan dilakukan dengan menggunakan label encoding berdasarkan urutan rata-rata views tiap channel dari yang terbesar hingga terkecil. 

In [145]:
# Average `views` per channel. The dict-of-list aggregation spec produces
# two-level column labels, so the mean is addressed as ('views', 'mean').
# NOTE(review): this ranking is computed from `views`, which the notebook text
# names as the prediction target, over every row of the frame. Confirm the
# encoding is refit on training data only once a train/test split exists.
df_channel = (
    df
    .groupby('channel_title')
    .agg({'views': ['mean']})
    .reset_index()
)

# Highest mean first, then renumber rows 0..n-1 so row position equals rank - 1.
df_channel_sorted = (
    df_channel
    .sort_values(by=('views', 'mean'), ascending=False)
    .reset_index(drop=True)
)

df_channel_sorted

Unnamed: 0_level_0,channel_title,views
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
0,YouTube Spotlight,8.226435e+07
1,TaylorSwiftVEVO,2.864121e+07
2,Sony Pictures Entertainment,2.271359e+07
3,FoxStarHindi,2.251442e+07
4,Marvel Entertainment,1.969840e+07
...,...,...
1385,Pakkatv,1.344300e+04
1386,Reporter Roy,1.181600e+04
1387,Viral in India,1.139100e+04
1388,Wide Angle Pictures,1.133700e+04


In [146]:
# Channel name -> 1-based rank (1 = highest mean views). Counting from 1 over
# the sorted names matches `index + 1` because df_channel_sorted carries a
# fresh 0..n-1 index.
channel_label_dict = {
    channel: rank
    for rank, channel in enumerate(df_channel_sorted['channel_title'], start=1)
}

In [147]:
# Swap every channel name for its rank from channel_label_dict.
# Series.map gives NaN for values that are not keys of the dict, so this cell
# is meant to run once per kernel session: a second run would look up ranks
# instead of names.
df = df.assign(channel_title=df['channel_title'].map(channel_label_dict))
df.head(n=3)

Unnamed: 0,trending_date,channel_title,category_id,publish_time,likes,dislikes,comment_count,desc_len,publish_date,views
0,2017-11-14,101,1,12:20:39,33966,798,882,920,2017-11-12,1096327
1,2017-11-14,149,25,05:43:56,735,904,0,2232,2017-11-13,590101
2,2017-11-14,708,24,15:48:08,2011,243,149,482,2017-11-12,473988


Kolom channel_title sekarang sudah bertipe numerik.

### Datetime Feature Engineering

In [148]:
# Parse the date/time columns into pandas datetimes so the `.dt` accessor can
# be used on them. errors='coerce' turns unparseable values into NaT instead
# of raising.
# publish_time holds clock times only ('%H:%M:%S'); only its time-of-day part
# is meaningful after parsing.
df['publish_time'] = pd.to_datetime(df['publish_time'], format='%H:%M:%S', errors='coerce')
df['trending_date'] = pd.to_datetime(df['trending_date'], errors='coerce')
# publish_date is read with `.dt` later on as well, so convert it explicitly
# rather than relying on read_excel having inferred a datetime dtype. This is
# a no-op when the column is already datetime64.
df['publish_date'] = pd.to_datetime(df['publish_date'], errors='coerce')

Hari, bulan, dan tahun dapat diambil dari tanggal trending (trending_date) dan tanggal publish (publish_date).

In [150]:
# Append day / month / year columns for both the trending date and the
# publish date, in that order.
df = df.assign(
    trending_day=df['trending_date'].dt.day,
    trending_month=df['trending_date'].dt.month,
    trending_year=df['trending_date'].dt.year,
    publish_day=df['publish_date'].dt.day,
    publish_month=df['publish_date'].dt.month,
    publish_year=df['publish_date'].dt.year,
)

Selain itu, dari publish_time hanya akan diambil jamnya saja.

In [151]:
# Keep only the hour-of-day component of the publish time.
df = df.assign(publish_hour=df['publish_time'].dt.hour)

Setelah melakukan ekstraksi, kolom lama akan dihapus karena bertipe datetime dan sudah tidak dipakai lagi

In [None]:
# The derived day/month/year/hour columns replace these source columns.
df = df.drop(['trending_date', 'publish_time', 'publish_date'], axis='columns')

Berikut adalah hasil akhir dari feature engineering

In [153]:
# Display the engineered frame (Jupyter renders the cell's last expression).
df

Unnamed: 0,channel_title,category_id,likes,dislikes,comment_count,desc_len,views,trending_day,trending_month,trending_year,publish_day,publish_month,publish_year,publish_hour
0,101,1,33966,798,882,920,1096327,14,11,2017,12,11,2017,12
1,149,25,735,904,0,2232,590101,14,11,2017,13,11,2017,5
2,708,24,2011,243,149,482,473988,14,11,2017,12,11,2017,15
3,224,23,70353,1624,2684,263,1242680,14,11,2017,12,11,2017,7
4,431,24,492,293,66,753,464015,14,11,2017,13,11,2017,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36786,222,23,3291,404,196,818,214378,14,6,2018,13,6,2018,8
36787,801,24,1726,478,1428,468,406828,14,6,2018,13,6,2018,11
36788,572,24,1216,453,697,2187,386319,14,6,2018,13,6,2018,5
36789,1073,24,698,115,65,432,130263,14,6,2018,13,6,2018,15


In [160]:
# Persist the engineered frame. index=True is the pandas default, spelled out
# here: the row index is written as an extra, unnamed leading column.
# NOTE(review): confirm downstream readers expect that column (e.g. by reading
# the file back with index_col=0).
df.to_excel('../dataset/preprocess.xlsx', index=True)