<a href="https://colab.research.google.com/github/VictorL85/pyspark/blob/main/tratamento.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark



In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [None]:
# Reuse the active Spark session when one exists; otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [None]:
# Load the per-video statistics: the first CSV row supplies the column names
# and the column types are inferred from the data.
df_video = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('/content/videos-stats.csv')
)

In [None]:
# Replace missing engagement metrics with zero so later arithmetic on these
# columns does not produce nulls.
df_video = df_video.na.fill({'Likes': 0, 'Comments': 0, 'Views': 0})

In [None]:
# Load the comments dataset.
# Free-text comments may contain line breaks and doubled quotes (""), so the
# reader is told to keep quoted newlines inside the field (multiLine) and to
# treat '"' as the escape character. With the default options such records are
# split into bogus rows; the recorded schema of this frame, where the `_c0`
# index column was inferred as string, is consistent with that.
df_comentario = spark.read.csv(
    '/content/comments.csv',
    header=True,
    inferSchema=True,
    multiLine=True,
    escape='"',
)

In [None]:
# Row counts of both raw datasets (each count triggers a Spark action).
total_videos = df_video.count()
total_comentarios = df_comentario.count()
print(f'quantidade de video: {total_videos}')
print(f'quantidade de comentarios: {total_comentarios}')

quantidade de video: 1869
quantidade de comentarios: 22555


In [None]:
# Rows without a 'Video ID' cannot be joined later, so discard them from both
# frames and report the remaining row counts.
df_video = df_video.na.drop(subset=['Video ID'])
df_comentario = df_comentario.na.drop(subset=['Video ID'])

videos_restantes = df_video.count()
comentarios_restantes = df_comentario.count()
print(f'quantidade de video sem Video ID nulo: {videos_restantes}')
print(f'quantidade de comentarios sem Video ID nulo: {comentarios_restantes}')

quantidade de video sem Video ID nulo: 1881
quantidade de comentarios sem Video ID nulo: 22555


In [None]:
# Keep a single row per 'Video ID'.
df_video = df_video.drop_duplicates(subset=['Video ID'])

In [None]:
# Show the column names and types of the comments frame.
df_comentario.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Likes Comment: integer (nullable = true)
 |-- Sentiment: integer (nullable = true)



In [None]:
# Show the column names and types of the videos frame.
df_video.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)
 |-- Interaction: integer (nullable = true)
 |-- Year: string (nullable = true)



In [None]:
# Force the three engagement metrics to integer type.
# NOTE(review): 'int' is 32-bit in Spark; confirm no metric exceeds that range.
for metric in ('Likes', 'Comments', 'Views'):
    df_video = df_video.withColumn(metric, col(metric).cast('int'))

In [None]:
# Cast the numeric comment columns to integer, then rename 'Likes' to
# 'Likes Comment' so it does not clash with the videos' 'Likes' column after
# the join. The rename comes last because the cast still refers to 'Likes'.
df_comentario = (
    df_comentario
    .withColumn('Likes', col('Likes').cast('int'))
    .withColumn('Sentiment', col('Sentiment').cast('int'))
    .withColumnRenamed('Likes', 'Likes Comment')
)

In [None]:
# Total engagement per video: likes + comments + views.
# The first operand is widened to 64-bit so the whole sum is evaluated as a
# long. Adding three 32-bit ints can exceed the int range, which either wraps
# around silently or raises an arithmetic-overflow error when ANSI mode is on.
df_video = df_video.withColumn(
    'Interaction',
    col('Likes').cast('long') + col('Comments') + col('Views'),
)

In [None]:
# Normalise 'Published At' to a date column.
published_as_date = to_date('Published At')
df_video = df_video.withColumn('Published At', published_as_date)

In [None]:
# Derive the publication year as a four-digit string ('yyyy' pattern).
publication_year = date_format('Published At', 'yyyy')
df_video = df_video.withColumn('Year', publication_year)

In [None]:
# Attach every comment to its video; videos without comments (and comments
# without a matching video) are left out by the inner join.
df_join_video_comments = df_video.join(
    df_comentario,
    on='Video ID',
    how='inner',
)

In [None]:
# Load the US trending-videos dataset.
# The recorded schema of this frame shows numeric-looking columns (`views`,
# `likes`, `category_id`, ...) inferred as string, which indicates records
# being split or misaligned by the default parser. Quoted fields are therefore
# allowed to span lines (multiLine) and '"' is used as the escape character so
# doubled quotes inside a field are read correctly.
df_us_videos = spark.read.csv(
    '/content/USvideos.csv',
    header=True,
    inferSchema=True,
    multiLine=True,
    escape='"',
)

In [None]:
# Match videos from both datasets by title.
# NOTE(review): the joined frame keeps both `Title` (left) and `title` (right);
# selecting either by name afterwards is presumably ambiguous — confirm before
# using this frame beyond schema inspection.
same_title = df_video['Title'] == df_us_videos['Title']
df_join_video_usvideos = df_video.join(df_us_videos, on=same_title, how='inner')

In [None]:
# Show the combined columns of the title-based join.
df_join_video_usvideos.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)
 |-- Interaction: integer (nullable = true)
 |-- Year: string (nullable = true)
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_e

In [None]:
# Data-quality check: number of null values in every column of the videos
# frame. `when` yields a value only for null cells and `count` ignores the
# resulting nulls, so each aggregate counts exactly the null cells.
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df_video.columns
]
df_video.select(null_counts).show()

+---+-----+--------+------------+-------+-----+--------+-----+-----------+----+
|_c0|Title|Video ID|Published At|Keyword|Likes|Comments|Views|Interaction|Year|
+---+-----+--------+------------+-------+-----+--------+-----+-----------+----+
|  0|    0|       0|           0|      0|    0|       0|    0|          0|   0|
+---+-----+--------+------------+-------+-----+--------+-----+-----------+----+



In [None]:
# Persist the cleaned videos (without the `_c0` CSV index column) as Parquet,
# replacing any previous export.
# NOTE(review): 'header' is a CSV option and is presumably ignored by the
# Parquet writer — confirm and drop it.
(
    df_video
    .drop('_c0')
    .write
    .option('header', 'true')
    .parquet('videos-tratados-parquet', mode='overwrite')
)

In [None]:
# Show the column names and types of the videos-with-comments frame.
df_join_video_comments.printSchema()

root
 |-- Video ID: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)
 |-- Interaction: integer (nullable = true)
 |-- Year: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Likes Comment: integer (nullable = true)
 |-- Sentiment: integer (nullable = true)



In [None]:
# Remove the `_c0` CSV index column; both source frames carry one, and it has
# no meaning in the joined result.
df_join_video_comments = df_join_video_comments.drop('_c0')

In [None]:
# Persist the videos-with-comments frame as Parquet, replacing any previous
# export.
# NOTE(review): 'header' is a CSV option and is presumably ignored by the
# Parquet writer — confirm and drop it.
(
    df_join_video_comments
    .write
    .option('header', 'true')
    .parquet('videos-comments-tratados-parquet', mode='overwrite')
)

In [None]:
# Shut down the Spark session now that all outputs have been written.
spark.stop()