## Exploring Data

### Importing the necessary libraries and the dataset

In [11]:
import os

import plotly.express as px
import plotly.graph_objects as go
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

# Cluster address and input path default to the values this notebook was
# written against, but can be overridden through the environment so the
# notebook can be re-run elsewhere without editing code.
# NOTE(review): the default master host looks like a Docker container ID,
# which would change when the container is recreated - confirm.
SPARK_MASTER_URL = os.environ.get('SPARK_MASTER_URL', 'spark://bf7d5520f003:7077')
DATA_PATH = os.environ.get('DATA_PATH', '/app/data.csv')

# Create (or reuse) the SparkSession.
spark = (
    SparkSession.builder
    .appName('app')
    .master(SPARK_MASTER_URL)
    .getOrCreate()
)

# Read the CSV file: the first row is the header and column types are inferred.
data = spark.read.csv(DATA_PATH, header=True, inferSchema=True)

# Show the first rows of the data.
data.show()

# Convert the Spark DataFrame to a pandas DataFrame; the plotting cells
# need an in-memory frame.
pandas_df = data.toPandas()

# Plot total price per product category with Plotly Express.
fig = px.scatter(pandas_df,
                 x='product_category_name',
                 y='total_price',
                 title='Total Price by Product Category')
fig.show()

+----------+---------------------+----------+---+-----------+-------------+-----------+-------------------+--------------------------+------------------+----------------+-------------+---------+-------+-------+-------+-----+----+-----------+------+-----------+---+-----------+-----------+---+-----------+-----------+---+-----------+-----------+
|product_id|product_category_name|month_year|qty|total_price|freight_price| unit_price|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_score|customers|weekday|weekend|holiday|month|year|          s|volume|     comp_1|ps1|        fp1|     comp_2|ps2|        fp2|     comp_3|ps3|        fp3|  lag_price|
+----------+---------------------+----------+---+-----------+-------------+-----------+-------------------+--------------------------+------------------+----------------+-------------+---------+-------+-------+-------+-----+----+-----------+------+-----------+---+-----------+-----------+---+-----------+------

### Checking the null values

In [None]:
# One aggregate expression per column: flag each null as 1, everything else
# as 0, and sum the flags to get that column's null count.
null_count_exprs = [
    F.sum(F.col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in data.columns
]
null_counts = data.agg(*null_count_exprs)

# Display the number of null values in each column.
null_counts.show()

### Descriptive statistics of the data

In [None]:
# Compute descriptive statistics for the dataset
description = data.describe()

# Display the result
description.show()

### Exploring the distribution of the prices

In [None]:
# Plot from the pandas copy: `data` is a Spark DataFrame, while the column
# names given to Plotly Express are resolved against an in-memory frame.
fig = px.histogram(pandas_df,
                   x='total_price',
                   nbins=20,
                   title='Distribution of Total Price')
fig.show()

### Exploring the distribution of the unit prices

In [None]:
# Plot from the pandas copy: `data` is a Spark DataFrame, while the column
# names given to Plotly Express are resolved against an in-memory frame.
fig = px.box(pandas_df,
             y='unit_price',
             title='Box Plot of Unit Price')
fig.show()

### Relationship between quantity and total prices

In [None]:
# Plot from the pandas copy: `data` is a Spark DataFrame, while the column
# names given to Plotly Express are resolved against an in-memory frame.
# Note: trendline="ols" needs the statsmodels package to be installed.
fig = px.scatter(pandas_df,
                 x='qty',
                 y='total_price',
                 title='Quantity vs Total Price',
                 trendline="ols")
fig.show()

### Average total prices by product categories

In [None]:
# Aggregate before plotting: px.bar draws one bar segment per input row and
# stacks them, so plotting the raw rows would show per-category totals rather
# than the average promised by the title. The pandas copy is used because
# `data` is a Spark DataFrame.
avg_total_price_by_category = (
    pandas_df
    .groupby('product_category_name')['total_price']
    .mean()
    .reset_index()
)

fig = px.bar(avg_total_price_by_category,
             x='product_category_name',
             y='total_price',
             title='Average Total Price by Product Category')
fig.show()

### Distribution of total prices by weekday

In [None]:
# Plot from the pandas copy: `data` is a Spark DataFrame, while the column
# names given to Plotly Express are resolved against an in-memory frame.
fig = px.box(pandas_df,
             x='weekday',
             y='total_price',
             title='Box Plot of Total Price by Weekday')
fig.show()

### Distribution of total prices by holiday

In [None]:
# Plot from the pandas copy: `data` is a Spark DataFrame, while the column
# names given to Plotly Express are resolved against an in-memory frame.
fig = px.box(pandas_df,
             x='holiday',
             y='total_price',
             title='Box Plot of Total Price by Holiday')
fig.show()

### Correlation between the numerical features with each other

In [None]:
# Pairwise correlation of the numeric columns. This uses the pandas copy
# because Spark's DataFrame.corr only correlates two named columns and returns
# a single number, not a matrix. Non-numeric columns are dropped first so
# corr() never sees string values.
correlation_matrix = pandas_df.select_dtypes(include='number').corr()

fig = go.Figure(go.Heatmap(x=correlation_matrix.columns,
                           y=correlation_matrix.columns,
                           z=correlation_matrix.values))
fig.update_layout(title='Correlation Heatmap of Numerical Features')
fig.show()

### Average competitor price difference by product category

In [None]:
# Difference between our unit price and the comp_1 price, per row.
# Built with .assign() on the pandas copy: `data` is a Spark DataFrame, which
# does not support item assignment, and .assign() returns a new frame so
# re-running this cell leaves pandas_df unchanged.
price_diff_df = pandas_df.assign(
    comp_price_diff=pandas_df['unit_price'] - pandas_df['comp_1']
)

# Mean price difference within each product category.
avg_price_diff_by_category = (
    price_diff_df
    .groupby('product_category_name')['comp_price_diff']
    .mean()
    .reset_index()
)

fig = px.bar(avg_price_diff_by_category,
             x='product_category_name',
             y='comp_price_diff',
             title='Average Competitor Price Difference by Product Category')
fig.update_layout(
    xaxis_title='Product Category',
    yaxis_title='Average Competitor Price Difference'
)
fig.show()

## Training a Machine Learning model for this project

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Build the modelling frame from the pandas copy rather than from the Spark
# DataFrame `data`. comp_price_diff is derived here so the cell does not rely
# on an earlier cell having added that column.
features_df = pandas_df.assign(
    comp_price_diff=pandas_df['unit_price'] - pandas_df['comp_1']
)

X = features_df[['qty', 'unit_price', 'comp_1',
                 'product_score', 'comp_price_diff']]
y = features_df['total_price']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)

# Train a decision tree regressor. random_state is fixed so that a
# Restart & Run All produces the same tree.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

### Predicting retail prices

In [None]:
# Predict on the held-out rows.
y_pred = model.predict(X_test)

# End points of the y = x reference line, spanning the observed actual values.
actual_min, actual_max = min(y_test), max(y_test)

fig = go.Figure()
fig.add_trace(go.Scatter(x=y_test, y=y_pred, mode='markers',
                         marker=dict(color='blue'),
                         name='Predicted vs. Actual Retail Price'))
# The reference trace is drawn with mode='lines', so its colour must be set
# through `line`; a `marker` colour is not applied to a trace without markers.
fig.add_trace(go.Scatter(x=[actual_min, actual_max], y=[actual_min, actual_max],
                         mode='lines',
                         line=dict(color='red'),
                         name='Ideal Prediction'))
fig.update_layout(
    title='Predicted vs. Actual Retail Price',
    xaxis_title='Actual Retail Price',
    yaxis_title='Predicted Retail Price'
)
fig.show()

In [None]:
# Stop the Spark session created earlier in the notebook.
spark.stop()