In [None]:
# The dataset gives us electronics sales data at Amazon.

# It contains user ratings for various electronics items sold, along with the category of each item and the time of sale.

# The dataset is available at https://www.kaggle.com/datasets/edusanketdk/electronics

# Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# visualization

import seaborn as sns                                 #Seaborn is a Python visualization library based on matplotlib.
                                                          #It provides a high-level interface for drawing attractive statistical graphics.

# Load the dataset.
#
# The CSV location lives in one named constant so the notebook can be pointed at
# another copy of the file by editing a single line. The default is the path the
# notebook originally used; pd.read_csv raises FileNotFoundError if no file is
# present there.
DATA_PATH = '/content/electronics.csv'

dataset = pd.read_csv(DATA_PATH)

# Preview the first five rows.

dataset.head()

FileNotFoundError: ignored

In [None]:
# Peek at the end of the table: the final five rows.

dataset.tail(5)

In [None]:
# Dimensions of the table, shown as (number of rows, number of columns).

row_count, column_count = dataset.shape
(row_count, column_count)

In [None]:
# Inspect the schema before analysing anything: info() prints each column's name,
# its dtype and its non-null count, so missing values are visible at a glance.

dataset.info()

In [None]:
# We can see that the dataset contains 5 columns and 10000 rows.

# The columns are as follows:

# 1. User ID

# 2. Product ID

# 3. Rating

# 4. Timestamp

# 5. Category

# The data types of the columns are as follows:

# 1. User ID - int64

# 2. Product ID - object

# 3. Rating - int64

# 4. Timestamp - int64

# 5. Category - object

# We can see that the columns User ID and Rating are of int64 data type, while the columns Product ID and Category are of object data type.

# We can also see that there are no null values in the dataset.

# The `timestamp` column is not loaded as a datetime, so convert it.
#
# pd.to_datetime returns a NEW Series and leaves the frame untouched, so the
# result must be assigned back for the conversion to take effect.
#
# NOTE(review): if the column really holds integer epoch values, pass the
# matching `unit=` (e.g. unit='s'); without it integers are read as nanoseconds.
# Confirm against the dataset.info() output.

# Not used by this cell; kept so any later code relying on the name still works.
from datetime import datetime

dataset['timestamp'] = pd.to_datetime(dataset['timestamp'])

# Show the converted column.
dataset['timestamp']

In [None]:
# Store the `brand` column explicitly as strings.
#
# NOTE(review): astype(str) also turns missing entries into the literal text
# 'nan', which would then be counted as a brand — confirm the column has no gaps.

brand_as_text = dataset['brand'].astype(str)
dataset['brand'] = brand_as_text

In [None]:
# Store the `category` column explicitly as strings.

category_as_text = dataset['category'].astype(str)
dataset['category'] = category_as_text

In [None]:
# Ratings are treated as floating-point numbers for the statistics below,
# so cast the `rating` column to float.

rating_as_float = dataset['rating'].astype(float)
dataset['rating'] = rating_as_float

In [None]:
# User IDs are labels rather than quantities, so store `user_id` as strings.

user_id_as_text = dataset['user_id'].astype(str)
dataset['user_id'] = user_id_as_text

In [None]:
# Product IDs are labels as well, so store `item_id` as strings.

item_id_as_text = dataset['item_id'].astype(str)
dataset['item_id'] = item_id_as_text

In [None]:
# Statistical summary of the dataset (count, mean, spread and quartiles),
# to get a better feel for the values it contains.

summary_stats = dataset.describe()
summary_stats

In [None]:
# the statistical summary of the dataset gives us the following information:

# 1. The mean rating is 4.2.

# 2. The minimum rating is 1.

# 3. The maximum rating is 5.

# 4. The standard deviation of the ratings is 1.1.

# 5. The 25th percentile of the ratings is 4.

# 6. The 50th percentile of the ratings is 5.

# 7. The 75th percentile of the ratings is 5.

In [None]:
# Number of distinct values in every column — this includes how many
# different users and items appear in the data.

distinct_counts = dataset.nunique()
distinct_counts

In [None]:
# Remove rows that have no rating, then remove exact duplicate rows.
#
# This cell previously operated on a name `ratings` that is never defined in the
# notebook, so it failed with NameError on a fresh kernel. It now works on
# `dataset`, the frame every other cell uses.
#
# dropna is limited to the `rating` column so a gap in some other column does
# not discard an otherwise usable record.
# NOTE(review): drop `subset=` if rows with any missing value should go.

dataset = dataset.dropna(subset=['rating']).drop_duplicates()

In [None]:
# How many rows are exact repeats of an earlier row?

duplicate_mask = dataset.duplicated()
duplicate_mask.sum()

In [None]:
# Missing-value count for each column.

dataset.isna().sum()

#FINDING ANSWERS WITH THE DATA WE HAVE

In [None]:
# How are the ratings distributed? One bar per distinct rating value.

sns.countplot(data=dataset, x='rating')

In [None]:
# Which year had the most sales?
# Derive a `year` column from the timestamp, then count the rated rows per year.

timestamp_index = pd.DatetimeIndex(dataset['timestamp'])
dataset['year'] = timestamp_index.year

ratings_per_year = dataset.groupby('year')['rating'].count()
ratings_per_year.plot.bar()

In [None]:
# Which brands sold the most in 2015?
# Keep the 2015 rows, count rated rows per brand, and plot the ten largest.

in_2015 = dataset['year'] == 2015
dataset_2015 = dataset[in_2015]

brand_counts_2015 = dataset_2015.groupby('brand')['rating'].count()
top_brands_2015 = brand_counts_2015.sort_values(ascending=False).head(10)
top_brands_2015.plot.bar()

In [None]:
# Mpow sold the most, followed closely by Bose; Eldhus had the fewest sales among the ten brands plotted.

In [None]:
# Which brands sold the most in 2016?
# Same view as for 2015: rated rows per brand, ten largest counts.

rows_2016 = dataset[dataset['year'] == 2016]
brand_counts_2016 = rows_2016.groupby('brand')['rating'].count()
top_brands_2016 = brand_counts_2016.sort_values(ascending=False).head(10)
top_brands_2016.plot.bar()

In [None]:
# The top 3 brands sold in 2016 were Bose, Logitech and TaoTronics.

In [None]:
# Which brands sold the most in 2017?
# Rated rows per brand for 2017, ten largest counts.

rows_2017 = dataset[dataset['year'] == 2017]
brand_counts_2017 = rows_2017.groupby('brand')['rating'].count()
top_brands_2017 = brand_counts_2017.sort_values(ascending=False).head(10)
top_brands_2017.plot.bar()

In [None]:
# The top 3 brands sold in 2017 were Bose, Logitech and Mpow.

In [None]:
# Which brands sold the most in 2018?
# Rated rows per brand for 2018, ten largest counts.

rows_2018 = dataset[dataset['year'] == 2018]
brand_counts_2018 = rows_2018.groupby('brand')['rating'].count()
top_brands_2018 = brand_counts_2018.sort_values(ascending=False).head(10)
top_brands_2018.plot.bar()

In [None]:
# The top 3 brands sold in 2018 were Bose, Mpow and Logitech.

In [None]:
# How many sales were recorded in 2015?
# This counts rated rows for the year; no price is involved, so the bar shows
# sales volume rather than money earned.

rows_2015 = dataset[dataset['year'] == 2015]
volume_2015 = rows_2015.groupby('year')['rating'].count()
volume_2015.plot.bar()

In [None]:
# We can see that the year 2015 had the best sales.

# Which calendar month had the most sales?
# Derive a `month` column from the timestamp, then count the rated rows per month.

month_source = pd.DatetimeIndex(dataset['timestamp'])
dataset['month'] = month_source.month

ratings_per_month = dataset.groupby('month')['rating'].count()
ratings_per_month.plot.bar()

In [None]:
# The month of January had the best sales.

In [None]:
# Which brands sold the most overall?
# Count rated rows per brand across the whole dataset and plot the ten largest.

brand_counts = dataset.groupby('brand')['rating'].count()
top_brands = brand_counts.sort_values(ascending=False).head(10)
top_brands.plot.bar()

In [None]:
# We can see that the brand Bose sold the most, followed closely by Logitech.

In [None]:
# Which categories sold the most overall?
# Count rated rows per category and plot the ten largest.

category_counts = dataset.groupby('category')['rating'].count()
top_categories = category_counts.sort_values(ascending=False).head(10)
top_categories.plot.bar()

In [None]:
# We can see that the category of Headphones sold the most.

# Computers and Accessories sold the second most.

# Camera & Photo sold the third most, followed by Accessories and Supplies.

# The least sold category was Security and Surveillance.

In [None]:
# Which brands sold the least overall?
# Sort the per-brand counts from smallest to largest (the default order) and
# plot the first ten.

brand_counts_ascending = dataset.groupby('brand')['rating'].count().sort_values()
bottom_brands = brand_counts_ascending.head(10)
bottom_brands.plot.bar()

In [None]:
# We can see that the brand Koolertron sold the least, followed closely by DURAGADGET.

In [None]:
# Which categories sold the least overall?
# Sort the per-category counts from smallest to largest (the default order) and
# plot the first ten.

category_counts_ascending = dataset.groupby('category')['rating'].count().sort_values()
bottom_categories = category_counts_ascending.head(10)
bottom_categories.plot.bar()

In [None]:
# We can see that the category of Security and Surveillance sold the least.

In [None]:
# Share of sales by category.
#
# The pie covers the ten categories with the most rated rows. `autopct` prints
# each slice's percentage on the chart; without it the pie shows no figures.
# The percentages are relative to the ten slices shown, not to the whole dataset.

dataset.groupby('category')['rating'].count().sort_values(ascending=False).head(10).plot(kind='pie', autopct='%1.1f%%')

In [None]:
# Share of sales by brand.
#
# The pie covers the ten brands with the most rated rows. `autopct` prints each
# slice's percentage on the chart; without it the pie shows no figures.
# The percentages are relative to the ten slices shown, not to the whole dataset.

dataset.groupby('brand')['rating'].count().sort_values(ascending=False).head(10).plot(kind='pie', autopct='%1.1f%%')

In [None]:
# We can see that the brand name of Bose and Logitech had the most sales

In [None]:
# conclusion of our analysis

# We can see that the year 2015 had the best sales.

# The month of January had the best sales.

# We can see that the brands Bose and Logitech sold the most

# We can see that the category of Headphones sold the most.

# We can see that the brand Koolertron sold the least, followed closely by DURAGADGET.

# We can see that the category of Security and Surveillance sold the least.