In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
%run
"./include/path_folder"

In [0]:
# Explicit schema applied when reading the raw YouTube CSV.
# String-typed columns: vdo_id, vdo_uploader, vdo_category, vdo_rel_id.
# All remaining columns are typed as doubles. Only vdo_id is declared non-nullable.
yt_schema = StructType([
    StructField("vdo_id", StringType(), nullable=False),
    StructField("vdo_uploader", StringType(), nullable=True),
    StructField("vdo_interval", DoubleType(), nullable=True),
    StructField("vdo_category", StringType(), nullable=True),
    StructField("vdo_length", DoubleType(), nullable=True),
    StructField("vdo_views", DoubleType(), nullable=True),
    StructField("vdo_rating", DoubleType(), nullable=True),
    StructField("vdo_num_rating", DoubleType(), nullable=True),
    StructField("vdo_comm", DoubleType(), nullable=True),
    StructField("vdo_rel_id", StringType(), nullable=True),
])

In [0]:
# Read the raw YouTube CSV data with a header row, applying yt_schema instead of
# letting Spark infer column types.
# NOTE(review): raw_path_folder is not defined in the visible cells — presumably
# it is provided by the %run include; confirm.
yt_df = (
    spark.read
    .schema(yt_schema)
    .option('Header', True)
    .csv(f'{raw_path_folder}/YouTube')
)

In [0]:
# Quick sanity check of the load: print the first 20 rows, truncating long cell
# values (these are show()'s defaults, spelled out here).
yt_df.show(n=20, truncate=True)

In [0]:
# Number of videos per category, largest category first.
# count('vdo_id') counts only rows whose vdo_id is non-null.
# The aggregate is named with alias() at the point of creation rather than
# renaming Spark's auto-generated 'count(vdo_id)' column afterwards, so this
# cell does not depend on how Spark labels aggregate expressions.
rank_spec = (
    yt_df
    .groupBy('vdo_category')
    .agg(count('vdo_id').alias('Number_of_videos'))
    .orderBy(desc('Number_of_videos'))
)

In [0]:
# Window over the whole aggregated frame, ordered by video count descending.
# There is no partitionBy, so Spark evaluates it in a single partition.
rank_prep = Window.orderBy(col("Number_of_videos").desc())

# Append a "Rank" column: rank() gives tied counts the same rank and leaves a
# gap after them.
rank_df = rank_spec.select("*", rank().over(rank_prep).alias("Rank"))

In [0]:
# Render the per-category video counts together with their rank.
display(rank_df)

vdo_category,Number_of_videos,Rank
Entertainment,908,1
Music,862,2
Comedy,414,3
People & Blogs,398,4
News & Politics,333,5
Film & Animation,260,6
Sports,251,7
Howto & Style,137,8
Travel & Events,112,9
Pets & Animals,95,10


In [0]:
# Order every video by rating, breaking ties by view count and then by number
# of ratings — all three keys descending.
vdo_ratings_df = yt_df.orderBy(
    col('vdo_rating').desc(),
    col('vdo_views').desc(),
    col('vdo_num_rating').desc(),
)
display(vdo_ratings_df)

vdo_id,vdo_uploader,vdo_interval,vdo_category,vdo_length,vdo_views,vdo_rating,vdo_num_rating,vdo_comm,vdo_rel_id
nUDksaQp3IA,kinxero,952.0,Entertainment,559.0,101016.0,5.0,4.0,81.0,8yMs-nvci18
K4i7PEClSF8,angelofevil69,691.0,Entertainment,238.0,75573.0,5.0,4.0,588.0,dLxWIlxDiRY
nwqXVAYsYW4,employeofdayear,1122.0,Music,291.0,28966.0,5.0,19.0,16.0,krqCi_y3z1g
hnA0VyR3HI4,u3astside,1107.0,Entertainment,373.0,22921.0,5.0,37.0,62.0,#NAME?
od1PLcqUhfs,badboyforeverdotcom,1040.0,Music,312.0,22118.0,5.0,38.0,19.0,KN_ACKracDU
Yx2iza6EfZo,taperted5150,781.0,Music,641.0,21049.0,5.0,8.0,1.0,
8o72MzrgFiU,ReelNASA,1087.0,Autos & Vehicles,599.0,17824.0,5.0,40.0,41.0,78ChatsRe6Y
gjbG5N4EfDc,ruitico34,838.0,Music,261.0,14752.0,5.0,43.0,23.0,BuNLvd77hBc
kXeamc7e31c,kipkay,1095.0,Howto & Style,72.0,14413.0,5.0,26.0,21.0,kB_gwYz5Mqs
MIQFTBsGccA,HortonGB,1116.0,Film & Animation,174.0,13103.0,5.0,42.0,45.0,TEJenuU1QKE


In [0]:
# Persist the category ranking as Parquet, replacing any previous output.
# NOTE(review): rank_df holds one row per vdo_category (it comes from a groupBy
# on that column), so partitioning by it produces one directory per row —
# consider whether partitionBy is wanted here.
rank_df.write.parquet(
    f"{processed_path_folder}/YouTube/Rank",
    mode="overwrite",
    partitionBy="vdo_category",
)

In [0]:
# Persist the rating-ordered videos as Parquet, one partition directory per
# vdo_category, replacing any previous output at this path.
vdo_ratings_df.write.parquet(
    f"{processed_path_folder}/YouTube/Ratings",
    mode="overwrite",
    partitionBy="vdo_category",
)

In [0]:
# Read the ranking back from the processed area to verify the write, sorted by
# Rank ascending (best rank first).
yt = (
    spark.read
    .parquet(f"{processed_path_folder}/YouTube/Rank")
    .orderBy(col('Rank').asc())
)
display(yt)

Number_of_videos,Rank,vdo_category
908,1,Entertainment
862,2,Music
414,3,Comedy
398,4,People & Blogs
333,5,News & Politics
260,6,Film & Animation
251,7,Sports
137,8,Howto & Style
112,9,Travel & Events
95,10,Pets & Animals


In [0]:
# Read the ratings output back from the processed area to verify the write.
yt_ratings = spark.read.format("parquet").load(
    f"{processed_path_folder}/YouTube/Ratings"
)
display(yt_ratings)

vdo_id,vdo_uploader,vdo_interval,vdo_length,vdo_views,vdo_rating,vdo_num_rating,vdo_comm,vdo_rel_id,vdo_category
nUDksaQp3IA,kinxero,952.0,559.0,101016.0,5.0,4.0,81.0,8yMs-nvci18,Entertainment
K4i7PEClSF8,angelofevil69,691.0,238.0,75573.0,5.0,4.0,588.0,dLxWIlxDiRY,Entertainment
hnA0VyR3HI4,u3astside,1107.0,373.0,22921.0,5.0,37.0,62.0,#NAME?,Entertainment
acG4teg-uX4,AsikTV,957.0,184.0,12513.0,5.0,16.0,6.0,8-QZSkoYxaw,Entertainment
XSAzJP8H30o,blindi1,947.0,161.0,9950.0,5.0,9.0,7.0,WdtIUTIU--w,Entertainment
OSJjtP_p-Hs,SamTubeCOM,1127.0,306.0,9433.0,5.0,1.0,31.0,zbr1EiIC2Ro,Entertainment
q-JndN8vrrE,badboy4lyfe2,968.0,366.0,9357.0,5.0,32.0,28.0,wDKaX0xkUmE,Entertainment
Hg47-CwiP-I,AsikTV,973.0,566.0,8690.0,5.0,25.0,3.0,mGE6diUx2IE,Entertainment
EohHqWBlx9Y,jani331691,794.0,469.0,7907.0,5.0,15.0,21.0,MxnyorU_Fr8,Entertainment
dISPc5wvX64,MadeinchinaReview,1111.0,225.0,7846.0,5.0,1.0,17.0,L30ePvCD8Ls,Entertainment


In [0]:
%sql
-- IF NOT EXISTS keeps this cell re-runnable: a plain CREATE DATABASE fails
-- once the database already exists (e.g. on a second "Run All").
CREATE DATABASE IF NOT EXISTS youtube;

In [0]:
# Load the processed ratings data that will be registered as a table.
# This reads the same path as the earlier yt_ratings read.
youtube = (
    spark.read
    .parquet(f"{processed_path_folder}/YouTube/Ratings")
)

In [0]:
# Render the ratings data that is about to be saved as a table.
display(youtube)

vdo_id,vdo_uploader,vdo_interval,vdo_length,vdo_views,vdo_rating,vdo_num_rating,vdo_comm,vdo_rel_id,vdo_category
nUDksaQp3IA,kinxero,952.0,559.0,101016.0,5.0,4.0,81.0,8yMs-nvci18,Entertainment
K4i7PEClSF8,angelofevil69,691.0,238.0,75573.0,5.0,4.0,588.0,dLxWIlxDiRY,Entertainment
hnA0VyR3HI4,u3astside,1107.0,373.0,22921.0,5.0,37.0,62.0,#NAME?,Entertainment
acG4teg-uX4,AsikTV,957.0,184.0,12513.0,5.0,16.0,6.0,8-QZSkoYxaw,Entertainment
XSAzJP8H30o,blindi1,947.0,161.0,9950.0,5.0,9.0,7.0,WdtIUTIU--w,Entertainment
OSJjtP_p-Hs,SamTubeCOM,1127.0,306.0,9433.0,5.0,1.0,31.0,zbr1EiIC2Ro,Entertainment
q-JndN8vrrE,badboy4lyfe2,968.0,366.0,9357.0,5.0,32.0,28.0,wDKaX0xkUmE,Entertainment
Hg47-CwiP-I,AsikTV,973.0,566.0,8690.0,5.0,25.0,3.0,mGE6diUx2IE,Entertainment
EohHqWBlx9Y,jani331691,794.0,469.0,7907.0,5.0,15.0,21.0,MxnyorU_Fr8,Entertainment
dISPc5wvX64,MadeinchinaReview,1111.0,225.0,7846.0,5.0,1.0,17.0,L30ePvCD8Ls,Entertainment


In [0]:
# Register the ratings data as the Parquet-backed table youtube.youtubedata.
# An explicit overwrite mode matches the other writes in this notebook and
# keeps the cell re-runnable: without a mode, saveAsTable raises an error when
# the table already exists.
(
    youtube.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("youtube.youtubedata")
)