### 1.1 Data Overviews

> Name of dataset: USVideos (Mainly about trending youtube videos in US from 14.11.2017 -> 14.06.2018)

### 1.2 Attribute-information
>	video_id-Unique video id

>	trending_date-the date on which the video started trending

>	title-Title of video

>	channel_title-video posted by channel

>	category_id-id of the video category (there are 32 category values)

>	publish_time-the time at which the video was uploaded

>	tags-tag given to video

>	views-no of views

>	likes-no of likes

>	dislikes-no of dislikes

>	comment_count-no of comment

### 1.3 EDA

In [None]:
#import libraries
import pandas as pd
import numpy as np  
import json
import matplotlib.pyplot as plt
import seaborn as sns
from statistics import mean
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Load the trending-video records from USvideos.csv into the working frame
# and preview its first rows.
youtube=pd.read_csv('USvideos.csv')
youtube.head()

> Xử lí file JSON để gắn tên thể loại cho từng video dựa trên category_id

In [None]:
# Build a category id -> category title lookup from the JSON file and attach
# the title to every video as a new `category_name` column.
# The file is opened explicitly as UTF-8 so that parsing does not depend on
# the platform's default text encoding.
with open('US_category_id.json', encoding='utf-8') as f:
    categories = json.load(f)['items']

# Ids are passed through int() so that they can be matched against the values
# of youtube['category_id'].
category_name = {
    int(category['id']): category['snippet']['title']
    for category in categories
}

# Series.map leaves NaN for any category_id that has no entry in the lookup.
youtube['category_name'] = youtube['category_id'].map(category_name)

In [None]:
# Preview the frame again, now including the category_name column.
youtube.head()

In [None]:
# Number of distinct values in every column; a missing value counts as one
# distinct value, exactly as it does with len(Series.unique()).
youtube.nunique(dropna=False)

In [None]:
# Summary statistics of the frame's columns.
youtube.describe()

In [None]:
# Bar charts on a 2x2 grid: how many videos have comments disabled, ratings
# disabled, were removed / errored, and how many fall into each category id.
fig = plt.figure(figsize=(15, 15))
columns_to_plot = [
    'comments_disabled',
    'ratings_disabled',
    'video_error_or_removed',
    'category_id',
]
for position, column in enumerate(columns_to_plot, start=1):
    frequencies = youtube[column].value_counts()
    fig.add_subplot(2, 2, position)
    sns.barplot(x=frequencies.index, y=frequencies.values, alpha=0.8)
    plt.title('{} vs No of video'.format(column))
    plt.xlabel('{}'.format(column))
    plt.ylabel('No of video')
plt.show()

### 1.3 Feature Engineering

In [None]:
# Number of tags per video: the count of "|" characters in the tag string
# plus one.
tags = youtube["tags"].map(lambda tag_string: tag_string.count("|") + 1)
youtube["No_tags"] = tags

In [None]:
# Print the character length of every description, one value per line.
# Each entry goes through str() first, so non-string entries are measured by
# the length of their text form.
description_lengths = (len(str(description)) for description in youtube["description"])
for length in description_lengths:
    print(length)

In [None]:
# Length of each video title in characters.
title_len = youtube["title"].map(len)
youtube["len_title"] = title_len

In [None]:
# Parse the raw publish timestamp once, then derive separate columns from it.
publish_time = pd.to_datetime(youtube['publish_time'], format='%Y-%m-%dT%H:%M:%S.%fZ')
parsed = publish_time.dt

# Time-of-day replaces the raw string; the calendar date gets its own column.
youtube['publish_time'] = parsed.time
youtube['publish_date'] = parsed.date

# Name of the weekday on which the video was published
youtube['publish_weekday'] = parsed.day_name()

In [None]:
# Engagement ratios, each rounded to 3 decimals:
#   new column -> (numerator column, denominator column)
ratio_columns = {
    "Ratio_View_likes": ("views", "likes"),
    "Ratio_View_dislikes": ("views", "dislikes"),
    "Ratio_views_comment_count": ("views", "comment_count"),
    "Ratio_likes_dislikes": ("likes", "dislikes"),
}
for ratio_name, (numerator, denominator) in ratio_columns.items():
    youtube[ratio_name] = (youtube[numerator] / youtube[denominator]).round(3)

# A zero denominator produces +/-inf (or NaN for 0/0). Turn the infinities
# into NaN and drop the rows whose ratios are undefined.
# Only the ratio columns are inspected: dropping on *any* NaN in the frame
# would also discard rows that merely have a missing value in an unrelated
# column, which is not what "removing the infinite values" is meant to do.
ratio_names = list(ratio_columns)
youtube[ratio_names] = youtube[ratio_names].replace([np.inf, -np.inf], np.nan)
youtube = youtube.dropna(subset=ratio_names)

In [None]:
# Largest value of each engagement ratio, printed in column order.
for ratio_column in (
    "Ratio_View_likes",
    "Ratio_View_dislikes",
    "Ratio_views_comment_count",
    "Ratio_likes_dislikes",
):
    print(max(youtube[ratio_column]))

In [None]:
# Encode weekday names as integers: Monday -> 1 ... Sunday -> 7.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_numbers = {name: number for number, name in enumerate(weekday_names, start=1)}
youtube['publish_weekday'] = youtube['publish_weekday'].replace(weekday_numbers)

In [None]:
# Number of videos per publish weekday: printed, then shown as a bar chart.
weekday_counts = youtube["publish_weekday"].value_counts()
print(weekday_counts)

plt.figure(figsize=(7, 7))
sns.barplot(x=weekday_counts.index, y=weekday_counts.values, alpha=0.8)
plt.title('No of videos vs weekdays')
plt.xlabel('weekdays')
plt.ylabel('no of videos')
plt.show()

In [None]:
# Preview the frame after the feature-engineering steps.
youtube.head()

### 1.4 Correlation Matrix

> Sự tương quan giữa các dữ liệu views, likes, dislikes, comment_count

In [None]:
# Correlation heatmap of the numeric features.
#
# NOTE: `data` is bound to the same object as `youtube` (no copy is made), so
# in-place changes made through either name are visible through both.
data = youtube

# The frame still holds text and date columns at this point. Recent pandas
# versions no longer skip such columns silently in corr() and raise instead,
# so the numeric and boolean columns are selected explicitly.
corr = data.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(12, 12))
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,  # fix the colour scale to the full correlation range
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
)
# Tilt the x tick labels so long column names do not overlap.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Preview the frame before the unused columns are removed.
youtube.head()

In [None]:
# Remove the columns that are not kept as model features.
# The drop happens in place, so the existing frame object itself is modified.
unused_columns = [
    'video_id',
    'trending_date',
    'title',
    'channel_title',
    'publish_time',
    'tags',
    'thumbnail_link',
    'comments_disabled',
    'ratings_disabled',
    'video_error_or_removed',
    'description',
    'category_name',
]
youtube.drop(columns=unused_columns, inplace=True)
youtube.head()


In [None]:
# The publish date is not used as a feature either; remove it in place.
youtube.drop(columns=['publish_date'], inplace=True)


In [None]:
# Preview the remaining columns.
youtube.head()

In [None]:
# Count the missing values in each column.
data.isnull().sum()

#### Không có giá trị NaN nên không cần xử lý dữ liệu trống
#### Xử lí dữ liệu ngoại lệ 

In [None]:
# Histogram helper used to inspect the shape of each column's distribution
def histogram(data, columns=('views', 'likes', 'dislikes', 'comment_count'),
              n_bins=30, density=True):
    """Draw one histogram per column on a two-column grid and show the figure.

    Args:
        data: table indexable by column name (``data[column]``).
        columns: names of the columns to plot, in panel order. Defaults to
            the four engagement columns.
        n_bins: number of bins used for every panel.
        density: forwarded to ``Axes.hist`` for every panel so that all
            panels share the same y-axis meaning. (Previously only the first
            two panels were normalised while the other two showed raw
            counts.)

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    grid_columns = 2
    # Ceiling division; keep at least one row so an empty `columns` still
    # yields a valid (blank) figure instead of an error.
    grid_rows = max(1, -(-len(columns) // grid_columns))
    fig, axes = plt.subplots(nrows=grid_rows, ncols=grid_columns, squeeze=False)
    panels = axes.ravel()

    for ax, column in zip(panels, columns):
        ax.hist(data[column], n_bins, density=density, histtype='bar')
        ax.set_title(column)

    # Hide grid cells that have no column assigned to them.
    for ax in panels[len(columns):]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()

In [None]:
# Plot the views, likes, dislikes and comment_count columns as the starting
# point for outlier handling.
histogram(data)

#### Dữ liệu của cột views, likes, dislikes, comment_count có dạng phân bố lệch (skewed) nên ta có: 
* ==> + Biên trên = 3rd Quartile + 3*IQR
* ==> + Biên dưới = 1st Quartile - 3*IQR
* IQR: Interquartile range (IQR = 3rd Quartile - 1st Quartile)
    * 3rd Quartile = Percentile 75
    * 1st Quartile = Percentile 25

In [None]:
# Outlier bounds for the `likes` column: quartiles extended by 3 * IQR.
q1_likes = data['likes'].quantile(0.25)
q3_likes = data['likes'].quantile(0.75)
IQR_likes = q3_likes - q1_likes
ub_likes = q3_likes + 3 * IQR_likes  # upper bound
lb_likes = q1_likes - 3 * IQR_likes  # lower bound
print(ub_likes)
print(lb_likes)

In [None]:
# Outlier bounds for the `dislikes` column: quartiles extended by 3 * IQR.
q1_dislikes = data['dislikes'].quantile(0.25)
q3_dislikes = data['dislikes'].quantile(0.75)
IQR_dislikes = q3_dislikes - q1_dislikes
ub_dislikes = q3_dislikes + 3 * IQR_dislikes  # upper bound
lb_dislikes = q1_dislikes - 3 * IQR_dislikes  # lower bound
print(ub_dislikes)
print(lb_dislikes)

In [None]:
# Outlier bounds for the `comment_count` column: quartiles extended by 3 * IQR.
q1_comments = data['comment_count'].quantile(0.25)
q3_comments = data['comment_count'].quantile(0.75)
IQR_comments = q3_comments - q1_comments
ub_comments = q3_comments + 3 * IQR_comments  # upper bound
lb_comments = q1_comments - 3 * IQR_comments  # lower bound
print(ub_comments)
print(lb_comments)

In [None]:
# Outlier bounds for the `views` column: quartiles extended by 3 * IQR.
q1_views = data['views'].quantile(0.25)
q3_views = data['views'].quantile(0.75)
IQR_views = q3_views - q1_views
ub_views = q3_views + 3 * IQR_views  # upper bound
lb_views = q1_views - 3 * IQR_views  # lower bound
print(ub_views)
print(lb_views)

In [None]:
# Two independent snapshots of the feature frame:
#   data_copy_non_pr - kept as-is, without outlier handling
#   data_copy        - the one that outlier handling is applied to
data_copy_non_pr = data.copy()
data_copy = data.copy()

In [None]:
# Cap the outliers: any value outside [lower bound, upper bound] is replaced
# by the bound it crosses.
#
# Series.clip is used rather than writing the bound into selected rows via
# .loc: the bounds are floats, and assigning a float into an existing
# non-float column through .loc is deprecated incompatible-dtype setting in
# newer pandas, whereas clip returns a new column with a suitable dtype.
# Both bounds are applied, matching the rule stated for this step (the
# previous code only enforced the upper one).
outlier_bounds = {
    'views': (lb_views, ub_views),
    'likes': (lb_likes, ub_likes),
    'dislikes': (lb_dislikes, ub_dislikes),
    'comment_count': (lb_comments, ub_comments),
}
for column, (lower_bound, upper_bound) in outlier_bounds.items():
    data_copy[column] = data_copy[column].clip(lower=lower_bound, upper=upper_bound)

In [None]:
# Plot the same columns again, this time from the outlier-handled copy.
histogram(data_copy)

### PREDICTING VIEWS

In [None]:
# Feature matrices: every column except the prediction target `views`.
# drop() returns a new frame here, so the two copies keep their `views` column.
data_drop_view = data_copy.drop(columns=['views'])
data_non_pr_drop_view = data_copy_non_pr.drop(columns=['views'])

In [None]:
# Score collectors: one list for the outlier-handled data, one for the raw data.
handle_accuracy, non_handle_accuracy = [], []

In [None]:
# Fit a linear regression on both variants of the data and record the R^2
# score on the held-out 20 %.
#   suffix 1 -> outlier-handled data
#   suffix 2 -> data without outlier handling
#
# The split keeps shuffle=False, i.e. the last 20 % of the rows form the test
# set. With shuffling off, random_state has no effect on train_test_split, so
# the former loop over ten random_state values fitted the same models ten
# times and appended ten identical scores. A single split gives the same mean
# without the redundant work.
# NOTE(review): if repeated *randomised* splits were intended, enable
# shuffling and loop over seeds - confirm the intended evaluation protocol.

# (Re)initialised here so that re-running this cell does not accumulate
# scores from earlier runs.
handle_accuracy = []
non_handle_accuracy = []

train1, test1, y_train1, y_test1 = train_test_split(
    data_drop_view, data_copy['views'], test_size=0.2, shuffle=False)
train2, test2, y_train2, y_test2 = train_test_split(
    data_non_pr_drop_view, data_copy_non_pr['views'], test_size=0.2, shuffle=False)

model = LinearRegression()

model.fit(train1, y_train1)
y_pred1 = model.predict(test1)
handle_accuracy.append(r2_score(y_test1, y_pred1))

# fit() discards the previous coefficients, so the same estimator is reused.
model.fit(train2, y_train2)
y_pred2 = model.predict(test2)
non_handle_accuracy.append(r2_score(y_test2, y_pred2))

In [None]:
# True vs. predicted `views` for the model trained on the data without
# outlier handling: printed as a table, then plotted with a fitted line.
SK = pd.DataFrame({'True Labels': y_test2, 'Predicted Labels': y_pred2})
print(SK)

lm1 = sns.lmplot(data=SK, x="True Labels", y="Predicted Labels", height=10)
lm1.fig.suptitle("Sklearn ", fontsize=18)

# Called after the plot above has already been created.
sns.set(font_scale=1.5)

In [None]:
# Report the mean collected score (these are R^2 values) for the raw data
# first, then for the outlier-handled data.
for label, scores in (
    ("Accuracy: ", non_handle_accuracy),
    ("Accuracy after handle exception: ", handle_accuracy),
):
    print(label, mean(scores))

> Nhận xét:
Việc xử lí ngoại lệ outliers có thể làm giảm hiệu suất của thuật toán LinearRegression (làm mất tính tổng quát)

### Feature Scaling

In [None]:
# Repeat the linear-regression comparison with min-max scaled features.
from sklearn.preprocessing import MinMaxScaler

# Start from empty score lists so that the means printed afterwards describe
# this experiment only, instead of being mixed with scores collected earlier.
handle_accuracy = []
non_handle_accuracy = []

# suffix 1 -> outlier-handled data, suffix 2 -> data without outlier handling.
# shuffle=False makes the split deterministic (last 20 % of the rows is the
# test set) and random_state has no effect without shuffling, so one split
# is sufficient.
train1, test1, y_train1, y_test1 = train_test_split(
    data_drop_view, data_copy['views'], test_size=0.2, shuffle=False)
train2, test2, y_train2, y_test2 = train_test_split(
    data_non_pr_drop_view, data_copy_non_pr['views'], test_size=0.2, shuffle=False)

scaler = MinMaxScaler()
model = LinearRegression()

# The scaler is fitted on the training rows of the data set being evaluated
# and only applied (not re-fitted) to the matching test rows.
model.fit(scaler.fit_transform(train1), y_train1)
y_pred1 = model.predict(scaler.transform(test1))
handle_accuracy.append(r2_score(y_test1, y_pred1))

# fit_transform re-fits the scaler, so data set 2 is scaled by its own
# training-set minima and maxima rather than by those of data set 1.
model.fit(scaler.fit_transform(train2), y_train2)
y_pred2 = model.predict(scaler.transform(test2))
non_handle_accuracy.append(r2_score(y_test2, y_pred2))

In [None]:
# Report the mean collected score (these are R^2 values) for the raw data
# first, then for the outlier-handled data.
for label, scores in (
    ("Accuracy: ", non_handle_accuracy),
    ("Accuracy after handle exception: ", handle_accuracy),
):
    print(label, mean(scores))

### Random Forest

In [None]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import GridSearchCV
# nEstimator = [140,160,180,200,220]
# depth = [10,15,20,25,30]

# RF = RandomForestRegressor()
# hyperParam = [{'n_estimators':nEstimator,'max_depth': depth}]
# gsv = GridSearchCV(RF,hyperParam,cv=5,verbose=1,scoring='r2',n_jobs=-1)
# gsv.fit(train2, y_train2)
# print("Best HyperParameter: ",gsv.best_params_)
# print(gsv.best_score_)
# scores = gsv.cv_results_['mean_test_score'].reshape(len(nEstimator),len(depth))
# plt.figure(figsize=(8, 8))
# plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
# plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot)
# plt.xlabel('n_estimators')
# plt.ylabel('max_depth')
# plt.colorbar()
# plt.xticks(np.arange(len(nEstimator)), nEstimator)
# plt.yticks(np.arange(len(depth)), depth)
# plt.title('Grid Search r^2 Score')
# plt.show()
# maxDepth=gsv.best_params_['max_depth']
# nEstimators=gsv.best_params_['n_estimators']

In [None]:
# from sklearn.ensemble import RandomForestRegressor

# model = RandomForestRegressor(n_estimators = nEstimators,max_depth=maxDepth)
# model.fit(train2, y_train2)


# # predicting the  test set results
# y_pred = model.predict(test2)
# print('Root means score', np.sqrt(mean_squared_error(y_test2, y_pred)))
# print('Variance score: %.2f' % r2_score(y_test2, y_pred))
# print("Result :",model.score(test2, y_test2))
# d1 = {'True Labels': y_test2, 'Predicted Labels': y_pred}
# SK = pd.DataFrame(data = d1)
# print(SK)