<a href="https://colab.research.google.com/github/Samgomes2510/dashborad-ecommerce/blob/main/Tratamento.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark



In [2]:
# Tratamento de dados com PySpark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, year, to_date

In [3]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, regexp_extract

In [6]:
from google.colab import files
uploaded = files.upload()

Saving USvideos.csv to USvideos.csv
Saving videos-stats.csv to videos-stats.csv
Saving comments.csv to comments.csv


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Inicializa Spark
spark = SparkSession.builder.getOrCreate()

In [32]:
# 1. Ler videos-stats.csv
df_video = spark.read.csv("videos-stats.csv", header=True, inferSchema=True)

In [33]:
# 2. Substituir nulos
df_video = df_video.na.fill({'Likes': 0, 'Comments': 0, 'Views': 0})

In [34]:
# Mostrar as primeiras 5 linhas (padrão) do df_video
df_video.show()


+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|_c0|               Title|   Video ID|Published At|Keyword|   Likes|Comments|      Views|
+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech|  3407.0|   672.0|   135612.0|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech| 76779.0|  4306.0|  1758063.0|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech| 63825.0|  3338.0|  1564007.0|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech| 71566.0|  1426.0|   922918.0|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech| 96513.0|  5155.0|  1855644.0|
|  5|Brewmaster Answer...|18fwz9Itbvo|  2021-11-05|   tech| 33570.0|  1643.0|   943119.0|
|  6|Tech Monopolies: ...|jXf04bhcjbg|  2022-06-13|   tech|135047.0|  9367.0|  5937790.0|
|  7|I bought the STRA...|2TqOmtTAMRY|  2022-08-07|   tech|216935.0| 12605.0|  4782514.0|
|  8|15 Em

In [11]:
# 3. Ler comments.csv
df_comentario = spark.read.csv("comments.csv", header=True, inferSchema=True)


In [30]:
# Mostrar as primeiras 10 linhas do df_comentario
df_comentario.show(10)

+---+-----------+--------------------+-------------+---------+
|_c0|   Video ID|             Comment|Likes Comment|Sentiment|
+---+-----------+--------------------+-------------+---------+
|  0|wAZZ-UWGVHI|Let's not forget ...|           95|        1|
|  1|wAZZ-UWGVHI|Here in NZ 50% of...|           19|        0|
|  2|wAZZ-UWGVHI|I will forever ac...|          161|        2|
|  3|wAZZ-UWGVHI|Whenever I go to ...|            8|        0|
|  4|wAZZ-UWGVHI|Apple Pay is so c...|           34|        2|
|  5|wAZZ-UWGVHI|We’ve been houndi...|            8|        1|
|  6|wAZZ-UWGVHI|We only got Apple...|           29|        2|
|  7|wAZZ-UWGVHI|For now, I need b...|            7|        1|
|  8|wAZZ-UWGVHI|In the United Sta...|            2|        2|
|  9|wAZZ-UWGVHI|In Cambodia, we h...|           28|        1|
+---+-----------+--------------------+-------------+---------+
only showing top 10 rows



In [12]:
# 4. Contagem inicial
print("Registros df_video:", df_video.count())
print("Registros df_comentario:", df_comentario.count())

Registros df_video: 1881
Registros df_comentario: 30036


In [13]:
# 5. Remover registros com Video ID nulo
df_video = df_video.filter(col('Video ID').isNotNull())
df_comentario = df_comentario.filter(col('Video ID').isNotNull())
print("Após remover nulos - df_video:", df_video.count())
print("Após remover nulos - df_comentario:", df_comentario.count())


Após remover nulos - df_video: 1881
Após remover nulos - df_comentario: 22555


In [14]:
# 6. Remover duplicados em Video ID no df_video
df_video = df_video.dropDuplicates(['Video ID'])


In [15]:
# 7. Converter campos para int
df_video = df_video.withColumn('Likes', col('Likes').cast('int')) \
                   .withColumn('Comments', col('Comments').cast('int')) \
                   .withColumn('Views', col('Views').cast('int'))

In [16]:
# 8. Converter e renomear no df_comentario
df_comentario = df_comentario.withColumn('Likes', col('Likes').cast('int')) \
                             .withColumnRenamed('Likes', 'Likes Comment') \
                             .withColumn('Sentiment', col('Sentiment').cast('int'))

In [17]:
# 9. Criar campo Interaction
df_video = df_video.withColumn('Interaction', col('Likes') + col('Comments') + col('Views'))

In [18]:
# 10. Converter Published At para date
df_video = df_video.withColumn('Published At', to_date(col('Published At')))

In [19]:
# 11. Criar campo Year
df_video = df_video.withColumn('Year', year(col('Published At')))

In [20]:
# 12. Mesclar df_video e df_comentario por Video ID
df_join_video_comments = df_video.join(df_comentario, on='Video ID', how='left')

In [35]:
# 13. Ler USvideos.csv
df_us_videos = spark.read.csv("USvideos.csv", header=True, inferSchema=True)

In [36]:
# Mostrar as primeiras 20 linhas do df_us_videos
df_us_videos.show(20)

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...|        CaseyNeistat|         22|2017-11-13T17:13:...|     SHANtell martin| 748374| 57527|    2966|        15954|https://i.ytimg.c...|            False|           Fal

In [22]:
# 14. Mesclar df_video e df_us_videos por Title
df_join_video_usvideos = df_video.join(df_us_videos, on='Title', how='left')

In [37]:
# 15. Contagem de nulos em df_video
for column in df_video.columns:
    nulos = df_video.filter(col(column).isNull()).count()
    print(f"Coluna {column}: {nulos} nulos")


Coluna _c0: 0 nulos
Coluna Title: 0 nulos
Coluna Video ID: 0 nulos
Coluna Published At: 0 nulos
Coluna Keyword: 0 nulos
Coluna Likes: 0 nulos
Coluna Comments: 0 nulos
Coluna Views: 0 nulos


In [24]:
# 16. Remover _c0 e salvar df_video
if '_c0' in df_video.columns:
    df_video = df_video.drop('_c0')
df_video.write.mode('overwrite').option('header', 'true').parquet('drive/MyDrive/Colab Notbook/spark/videos-tratados-parquet')

In [25]:
# 16. Remover _c0 e salvar df_video
if '_c0' in df_video.columns:
    df_video = df_video.drop('_c0')
df_video.write.mode('overwrite').option('header', 'true').parquet('videos-tratados-parquet')

In [26]:
# 17. Remover _c0 e salvar df_join_video_comments
if '_c0' in df_join_video_comments.columns:
    df_join_video_comments = df_join_video_comments.drop('_c0')
df_join_video_comments.write.mode('overwrite').option('header', 'true').parquet('drive/MyDrive/Colab Notbooks/spark/videos-comments-tratados-parquet')

In [27]:
# 17. Remover _c0 e salvar df_join_video_comments
if '_c0' in df_join_video_comments.columns:
    df_join_video_comments = df_join_video_comments.drop('_c0')
df_join_video_comments.write.mode('overwrite').option('header', 'true').parquet('videos-comments-tratados-parquet')

In [38]:
# Visualizar os schemas
df_video.printSchema()
df_comentario.printSchema()
df_us_videos.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: double (nullable = false)
 |-- Comments: double (nullable = false)
 |-- Views: double (nullable = false)

root
 |-- _c0: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Likes Comment: integer (nullable = true)
 |-- Sentiment: integer (nullable = true)

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: 

In [39]:
spark.stop()