# Exploratory Data Analysis

##### Importing

In [15]:
import pandas as pd
import numpy as np
import plotly.express as px
import nbformat
import plotly.graph_objects as go

# Record the installed nbformat version; in the cells visible here nbformat is not used for anything else.
print(nbformat.__version__)

5.10.4


### Exploring Orders_clean

##### Importing and checking for null values

In [16]:
# Load the cleaned orders table from the working directory
orders = pd.read_csv(filepath_or_buffer="orders_clean.csv")

# Missing-value count for every column (isna is the same check as isnull)
orders.isna().sum()

id_x          0
user_id       0
product_id    0
quantity      0
category      0
price         0
total         0
dtype: int64

In [17]:
# Print the column dtypes, non-null counts and memory footprint of `orders`
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18760 entries, 0 to 18759
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id_x        18760 non-null  int64  
 1   user_id     18760 non-null  int64  
 2   product_id  18760 non-null  int64  
 3   quantity    18760 non-null  int64  
 4   category    18760 non-null  object 
 5   price       18760 non-null  float64
 6   total       18760 non-null  float64
dtypes: float64(2), int64(4), object(1)
memory usage: 1.0+ MB


#### Plotting and Analysis

In [18]:
# Report, per column, the shape of its array of distinct values
for column_name in orders.columns:
    distinct_values = orders[column_name].unique()
    print(f"{column_name}{distinct_values.shape}")

id_x(18760,)
user_id(1746,)
product_id(200,)
quantity(77,)
category(4,)
price(170,)
total(1666,)


##### Plotting Discrete Values

In [19]:
# Plotting the number of order rows per product category

# Tally rows per category and expose the tally as a two-column frame
category_counts = (
    orders["category"]
    .value_counts()
    .rename_axis("category")
    .reset_index(name="count")
)

# Bar chart of the tally, one colour per category
fig = px.bar(
    category_counts,
    x="category",
    y="count",
    color="category",
    title="Category Distribution in Orders",
    labels={"category": "Product Category", "count": "Number of Orders"},
)

# Render the figure
fig.show()

In [20]:
# Total quantity sold per product category
# Sum 'quantity' within each 'category', keeping 'category' as a regular column
category_quantity_sum = orders.groupby("category", as_index=False)["quantity"].sum()

# Bar chart of the summed quantities, one colour per category
fig = px.bar(
    category_quantity_sum,
    x="category",
    y="quantity",
    color="category",
    title="Sum of Quantity by Product Category",
    labels={"category": "Product Category", "quantity": "Sum of Quantity"},
)

# Render the figure
fig.show()

In [21]:
# Total sales value per product category
# Aggregate the 'total' column with a sum inside each 'category' group
category_total_sum = orders.groupby("category")["total"].agg("sum").reset_index()

# Bar chart of the summed totals, one colour per category
fig = px.bar(
    data_frame=category_total_sum,
    x="category",
    y="total",
    title="Sum of Total by Product Category",
    color="category",
    labels=dict(category="Product Category", total="Sum of Total"),
)

# Render the figure
fig.show()

##### Plotting Numerical Values

In [22]:
# Numerical columns whose distributions are compared
numerical_cols = ["quantity", "price", "total"]

# One semi-transparent histogram trace per numerical column, all on the same axes
fig = go.Figure(
    data=[
        go.Histogram(x=orders[col], name=col, opacity=0.7)
        for col in numerical_cols
    ]
)

# Titles, overlay mode and theme
fig.update_layout(
    template="plotly_dark",  # Optional: Dark theme
    barmode="overlay",  # Histograms are drawn on top of each other
    title="Distribution of Numerical Columns",
    xaxis_title="Value",
    yaxis_title="Count",
)

# Render the figure
fig.show()

In [23]:
# One standalone box plot per numerical column
for column in numerical_cols:
    fig = px.box(data_frame=orders, y=column, title=f"Box Plot of {column}")
    fig.show()

#### Relationship Analysis

In [24]:
# Pairwise correlations between the numerical columns
correlation_matrix = orders.loc[:, numerical_cols].corr()

# Heatmap of the correlation matrix
fig = px.imshow(
    img=correlation_matrix,
    title="Correlation Heatmap of Numerical Columns",
    labels={"color": "Correlation"},
    color_continuous_scale=[(0, "gray"), (1, "green")],  # gray at the low end, green at the high end
    text_auto=True,  # write each correlation value inside its cell
    template="plotly_dark",  # dark theme
)

# Render the figure
fig.show()

In [25]:
# Total quantity sold for every product id
# Sum 'quantity' within each 'product_id', keeping the id as a regular column
product_quantity_sum = orders.groupby("product_id", as_index=False)["quantity"].sum()

# Bar chart with one bar per product id
fig = px.bar(
    data_frame=product_quantity_sum,
    x="product_id",
    y="quantity",
    labels=dict(product_id="Product ID", quantity="Sum of Quantity"),
    title="Sum of Quantity by Product ID",
)

# Render the figure
fig.show()

In [26]:
# Distribution of the 'total' column within each product category
fig = px.box(
    data_frame=orders,
    x="category",
    y="total",
    color="category",
    labels=dict(category="Product Category", total="Total"),
    title="Box Plot of Total by Product Category",
)

# Render the figure
fig.show()

In [27]:
# Mean of 'quantity' within each category, named 'average_quantity' for clarity
average_quantity_by_category = (
    orders.groupby("category")["quantity"]
    .mean()
    .rename("average_quantity")
    .reset_index()
)

# Show the resulting table
print(average_quantity_by_category)

    category  average_quantity
0  Doohickey          3.618712
1     Gadget          3.819599
2      Gizmo          3.630226
3     Widget          3.738392


### Exploring Products_clean

In [28]:
# Load the cleaned products table and preview it
products = pd.read_csv(filepath_or_buffer="products_clean.csv")
products

Unnamed: 0,id,category,price,rating,title,vendor
0,1,Gizmo,29.4633,4.6,Rustic Paper Wallet,"Swaniawski, Casper and Hilll"
1,2,Doohickey,70.0799,0.0,Small Marble Shoes,Balistreri-Ankunding
2,3,Doohickey,35.3887,4.0,Synergistic Granite Chair,"Murray, Watsica and Wunsch"
3,4,Doohickey,73.9918,3.0,Enormous Aluminum Shirt,Regan Bradtke and Sons
4,5,Gadget,82.7451,4.0,Enormous Marble Wallet,"Price, Schultz and Daniel"
...,...,...,...,...,...,...
195,196,Widget,46.7641,0.0,Heavy-Duty Linen Toucan,Balistreri-Muller
196,197,Gizmo,46.7641,4.6,Aerodynamic Concrete Lamp,Erika Volkman Group
197,198,Gizmo,46.7641,4.1,Enormous Copper Shirt,"Considine, Schamberger and Schiller"
198,199,Widget,76.9533,3.6,Mediocre Leather Coat,"Gulgowski, Grimes and Mayer"


In [29]:
# Number of distinct values in every column of `products`
for column_name in products.columns:
    print(f"{column_name}  {products[column_name].nunique()}")

id  200
category  4
price  170
rating  23
title  199
vendor  200


In [30]:
# Box plot of product price for each category, one colour per category
fig = px.box(
    data_frame=products,
    x="category",
    y="price",
    color="category",
    title="Price Distribution by Category",
    labels=dict(price="Price", category="Category"),
)

# Render the figure
fig.show()

In [31]:
# Overlaid price histograms, one per category
fig = px.histogram(
    data_frame=products,
    x="price",
    color="category",
    nbins=40,
    barmode="overlay",
    labels=dict(price="Price", category="Category"),
    title="Price Distribution by Category",
)

fig.show()

In [32]:
# Overlaid rating histograms, one per category
fig = px.histogram(
    data_frame=products,
    x="rating",
    color="category",
    nbins=10,
    barmode="overlay",
    labels=dict(rating="Rating", category="Category"),
    title="Rating Distribution by Category",
)

fig.show()

In [56]:
# Scatter plot of price (y) against rating (x), one colour per category;
# hovering over a point also shows the product title and vendor.
# size_max was dropped: it only caps marker size when a `size` column is
# mapped, and none is mapped here, so it had no effect on the figure.
fig = px.scatter(products, x="rating", y="price", color="category",
                 hover_data=["title", "vendor"],
                 title="Scatter Plot of Rating vs. Price",
                 labels={"rating": "Rating", "price": "Price", "category": "Category"})

# Show the plot
fig.show()

### Exploring reviews_clean

In [None]:
# Load the cleaned reviews table
reviews = pd.read_csv("reviews_clean.csv")

# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() added a stray "None" line to the output.
reviews.info()

# Missing-value count for every column
print(reviews.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1112 entries, 0 to 1111
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             1112 non-null   int64 
 1   reviewer       1112 non-null   object
 2   product_id     1112 non-null   int64 
 3   rating         1112 non-null   int64 
 4   body           1112 non-null   object
 5   review_length  1112 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 52.3+ KB
None
id               0
reviewer         0
product_id       0
rating           0
body             0
review_length    0
dtype: int64


In [47]:
# Attach each reviewed product's category to its reviews
prodcate = products.loc[:, ["id", "category"]]

# Both frames carry an `id` column, so the merge suffixes them as id_x / id_y
reviewstemp = pd.merge(
    reviews,
    prodcate,
    how="inner",
    left_on="product_id",
    right_on="id",
)
# id_y is the product id coming from `prodcate`; it repeats product_id, so drop it
reviewsmerge = reviewstemp.drop(columns="id_y")

In [None]:
# Mean review rating for every product that has at least one review
rating_pivot_table = reviewsmerge.pivot_table(values="rating", index="product_id", aggfunc="mean")
print(rating_pivot_table)

# Printed explicitly: a bare expression in the middle of a cell is not
# displayed under the default notebook settings.
print(rating_pivot_table.isnull().sum())

# The pivot table only has a "rating" column, so the former
# rating_pivot_table["products"] lookup raised KeyError. Show the products
# table instead so its `rating` column can be compared with the means above.
products




              rating
product_id          
1           4.625000
3           4.000000
4           3.000000
5           4.000000
6           3.800000
...              ...
195         5.000000
197         4.666667
198         4.142857
199         3.666667
200         4.000000

[176 rows x 1 columns]


rating    0
dtype: int64

Unnamed: 0,id,category,price,rating,title,vendor
0,1,Gizmo,29.4633,4.6,Rustic Paper Wallet,"Swaniawski, Casper and Hilll"
1,2,Doohickey,70.0799,0.0,Small Marble Shoes,Balistreri-Ankunding
2,3,Doohickey,35.3887,4.0,Synergistic Granite Chair,"Murray, Watsica and Wunsch"
3,4,Doohickey,73.9918,3.0,Enormous Aluminum Shirt,Regan Bradtke and Sons
4,5,Gadget,82.7451,4.0,Enormous Marble Wallet,"Price, Schultz and Daniel"
...,...,...,...,...,...,...
195,196,Widget,46.7641,0.0,Heavy-Duty Linen Toucan,Balistreri-Muller
196,197,Gizmo,46.7641,4.6,Aerodynamic Concrete Lamp,Erika Volkman Group
197,198,Gizmo,46.7641,4.1,Enormous Copper Shirt,"Considine, Schamberger and Schiller"
198,199,Widget,76.9533,3.6,Mediocre Leather Coat,"Gulgowski, Grimes and Mayer"


In [None]:
# NOTE(review): bare display of the full `orders` frame; the cell has no execution count, so it looks like a leftover inspection — confirm it is still needed.
orders