<a href="https://colab.research.google.com/github/VictorL85/pyspark/blob/main/preparacao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark



In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler, PCA
from pyspark.ml.regression import LinearRegression

In [None]:
# Attach to a Spark session that is already active, or start a new one with the
# builder's default configuration when none exists.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [None]:
# Load the videos/comments Parquet file. Parquet files carry their own schema,
# so the CSV-only reader options `header` and `inferSchema` have no effect on
# this source and are intentionally not passed.
df_video = spark.read.parquet('/content/videos-comments-tratados.snappy.parquet')

In [None]:
# Preview a single row to check which columns were loaded.
df_video.show(1)

+-----------+--------------------+------------+-------+-----+--------+------+-----------+----+--------------------+---------+-------------+
|   Video ID|               Title|Published At|Keyword|Likes|Comments| Views|Interaction|Year|             Comment|Sentiment|Likes Comment|
+-----------+--------------------+------------+-------+-----+--------+------+-----------+----+--------------------+---------+-------------+
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|Let's not forget ...|        1|           95|
+-----------+--------------------+------------+-------+-----+--------+------+-----------+----+--------------------+---------+-------------+
only showing top 1 row



In [None]:
# Derive a numeric 'Month' column from the 'Published At' column.
published_month = month('Published At')
df_video = df_video.withColumn('Month', published_month)

In [None]:
# Encode the string 'Keyword' column as a numeric index stored in
# 'keyword_index': fit the indexer on the DataFrame, then apply the fitted model.
inde = StringIndexer(inputCol='Keyword', outputCol='keyword_index')
keyword_indexer_model = inde.fit(df_video)
df_video = keyword_indexer_model.transform(df_video)

In [None]:
# Replace 'Year' with the same values cast to an integer column.
year_as_int = df_video['Year'].cast('int')
df_video = df_video.withColumn('Year', year_as_int)

In [None]:
# Print the column names and types of the DataFrame built so far.
# NOTE(review): the saved output lists 'features', 'Features Normal' and
# 'Features PCA', which are only created by later cells — the output appears to
# come from an out-of-order run; confirm with Restart Kernel -> Run All.
df_video.printSchema()

root
 |-- Video ID: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)
 |-- Interaction: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Sentiment: integer (nullable = true)
 |-- Likes Comment: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- keyword_index: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- Features Normal: vector (nullable = true)
 |-- Features PCA: vector (nullable = true)



In [None]:
# Pack the numeric predictor columns into a single vector column 'features'.
feature_columns = ['Likes', 'Views', 'Year', 'Month', 'keyword_index']
vetor = VectorAssembler(inputCols=feature_columns, outputCol='features')
df_video = vetor.transform(df_video)

In [None]:
# Rescale the 'features' vector into 'Features Normal' using min/max statistics
# learned from the DataFrame.
# NOTE(review): the scaler is fitted on the whole DataFrame before the
# train/test split, so test rows influence the scaling — consider fitting on
# the training split only.
scaler = MinMaxScaler(inputCol='features', outputCol='Features Normal')
scaler_model = scaler.fit(df_video)
df_video = scaler_model.transform(df_video)

In [None]:
# Project 'Features Normal' onto a single principal component, stored in
# 'Features PCA'.
# NOTE(review): the regression cell trains on 'Features Normal', not on
# 'Features PCA' — confirm whether this column is needed downstream.
pca = PCA(k=1, inputCol='Features Normal', outputCol='Features PCA')
pca_model = pca.fit(df_video)
df_video = pca_model.transform(df_video)

In [None]:
# Random split with weights 0.8 / 0.2 and a fixed seed, then report the number
# of rows that landed in each part.
splits = df_video.randomSplit([0.8, 0.2], seed=42)
train_df, test_df = splits
train_count = train_df.count()
test_count = test_df.count()
print(f"treino: {train_count} e teste: {test_count}")

treino: 14789 e teste: 3620


In [None]:
# Fit a linear regression predicting 'Comments' from the scaled feature vector
# on the training split, then report RMSE and R2 measured on the test split.
regre = LinearRegression(featuresCol='Features Normal', labelCol='Comments')
model = regre.fit(train_df)
avaliar = model.evaluate(test_df)
rmse, r2 = avaliar.rootMeanSquaredError, avaliar.r2
print(f"RMSE: {rmse} e R2: {r2}")

RMSE: 25370.3336201662 e R2: 0.6602413154888491


In [None]:
# Persist the prepared DataFrame as Parquet, replacing any existing output at
# this (relative) path.
output_path = 'videos-preparados-parquet'
df_video.write.parquet(output_path, mode='overwrite')

In [None]:
# Stop the Spark session; cells that use `spark` cannot run after this without
# re-creating the session.
spark.stop()