In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<center> <h1> <img src="https://64.media.tumblr.com/6699ee089307fc877eeec217a7eb3f38/tumblr_mwu4ysVppy1svwlszo1_540.gifv" alt="drawing" style="height:50px;"/>EDA of Adidas and Nike Products<img src="https://i.gifer.com/RS9m.gif" alt="drawing" style="height:50px;"/></h1> </center>


---

<p style="font-family:verdana; text-align:center;"> This is an exploratory notebook looking at the performances of each brand, what products are popular and how often people visit the website. </p>

<div style="background-color:black"> <center> <h2 style="color:white"> 1) Load in Data 💿 </h2> </center> </div>


In [None]:
# Location of the dataset, relative to the notebook's working directory.
file = "../input/adidas-vs-nike/Adidas Vs Nike.csv"

# Read the CSV into the frame used by every later cell, then preview the first rows.
data = pd.read_csv(filepath_or_buffer=file)
data.head(n=5)

<h3 style="font-family:verdana; border:2px solid Black; color:black; text-align:center;">1. Summarise Data </h3>

<div align="center"><p style="font-family:verdana; text-align:center;"> There are 3268 products, each having 10 attributes which are:</p>
<ol style="font-family:verdana; background-color: #A9A9A9; color:white; display:table; margin:1px; text-align:left; border:2px solid black; border-radius: 10px; box-shadow: 2px 2px 4px #000000;">
    <li style="padding: 2px 5px; "> Product Name </li>
    <li style="padding: 2px 5px; "> Product ID </li>
<li style="padding: 2px 5px; "> Listing Price</li>
<li style="padding: 2px 5px; "> Sale Price</li>
<li style="padding: 2px 5px; "> Discount</li>
<li style="padding: 2px 5px; "> Brand</li>
<li style="padding: 2px 5px; "> Description</li>
<li style="padding: 2px 5px; "> Rating</li>
<li style="padding: 2px 5px; "> Reviews</li>
<li style="padding: 2px 5px; "> Last Visited</li>
</ol>
    </div>

In [None]:
# Show the (rows, columns) dimensions of the loaded dataset.
data.shape

<div style="background-color:black"> <center> <h2 style="color:white"> 2) Data Cleaning 🧹 </h2> </center> </div>

<h3 style="font-family:verdana; border:2px solid Black; color:black; text-align:center;">1. Check for null values</h3>

In [None]:
# Count the missing values in each column (isna is the same check as isnull).
null_counts = data.isna().sum()
print(null_counts)

<p style="font-family:verdana; text-align:center;">The data is fairly clean with the exception of 3 products with missing descriptions. We will change these null values to empty strings.</p>

In [None]:
# Fill only the Description column, which is where the missing values were
# reported. A frame-wide fillna("") would also write empty strings into any
# numeric column that happened to contain a NaN, silently turning it into an
# object column and breaking later numeric summaries.
data['Description'] = data['Description'].fillna("")

# Re-check: every column should now report zero missing values.
print(data.isnull().sum())

<h3 style="font-family:verdana; border:2px solid Black; color:black; text-align:center;">2. Change Last Visited </h3>
<p style="font-family:verdana; text-align:center;"> <b>Last Visited</b> is stored as a string. It should be converted to a datetime type for easier analysis and visualisation. </p>

In [None]:
# Look at the first Last Visited entry to see its raw format and Python type.
last_visited = data['Last Visited']
sample_date = last_visited.loc[0]
sample_type = type(sample_date)
print("The date is in the format: " + sample_date)
print("The type for the Last Visited Attribute is: " + str(sample_type))

In [None]:
from datetime import datetime

# Parse the ISO-format strings (date and time separated by "T") in a single
# vectorised call. Unlike an element-wise str.replace + fromisoformat loop,
# pd.to_datetime leaves an already-converted column untouched, so this cell can
# be re-run without raising.
data['Last Visited'] = pd.to_datetime(data['Last Visited'])

# Report the type of the values actually stored in the column.
dates = data['Last Visited'].tolist()
print("The type for the Last Visited Attribute is: " + str(type(dates[0])))

<h3 style="font-family:verdana; border:2px solid Black; color:black; text-align:center;">3. Remove Redundancies </h3>
<p style="font-family:verdana; text-align:center;"> It has been found that there is a redundant brand called <b>"Adidas Adidas ORIGINALS"</b> with 1 product in it. There is already a brand called <b>"Adidas ORIGINALS"</b> with several products in it. Therefore, we should change this product's brand to <b>"Adidas ORIGINALS"</b>.</p>

In [None]:
# Rows are read positionally; judging by the labels in the message below,
# 0 is the product name, 5 the brand and 6 the description.
redundant_brand = "Adidas Adidas ORIGINALS"
for row in data.values:
    # Skip every row that does not carry the mislabeled brand.
    if row[5] != redundant_brand:
        continue
    print("Redundant product is: {}\nBrand: {} \nDescription: {}".format(row[0], row[5], row[6]))

In [None]:
# Rename the mislabeled brand. The replacement is scoped to the Brand column so
# that an identical string in any other column is never rewritten.
data['Brand'] = data['Brand'].replace("Adidas Adidas ORIGINALS", "Adidas ORIGINALS")

<p style="font-family:verdana; text-align:center;"> There is redundancy in the date: the difference between the oldest and newest Last Visited values is only half an hour. Therefore, we shall keep only the minute of each visit.</p>

In [None]:
# Earliest and latest visit timestamps in the dataset.
last_visited = data['Last Visited']
min_date = last_visited.min()
max_date = last_visited.max()

# Time span covered by the Last Visited column.
date_range = max_date - min_date
print("Last Visited Range is: " + str(date_range))

In [None]:
# Keep just the minute component of each visit timestamp.
new_dates = data['Last Visited'].dt.minute.tolist()
data['Last Visited'] = new_dates

<h3 style="font-family:verdana; border:2px solid Black; color:black; text-align:center;">4. Remove Duplicates </h3>
<p style="font-family:verdana; text-align:center;"> The dataset appears to have <i style="color:blue;"> 7 </i> exact duplicates of products. This may be an artifact from the data collection step. We will remove these. </p>

In [None]:
from collections import Counter

# Flag every row that exactly repeats an earlier row, then tally True vs False.
duplicates = data.duplicated(keep='first')
duplicate_tally = Counter(duplicates)
print(duplicate_tally)

In [None]:
# Remove exact duplicate rows, keeping the last occurrence of each.
data = data.drop_duplicates(keep='last')
print("Duplicates have been dropped.")

# Confirm that no exact duplicates remain.
remaining_duplicates = Counter(data.duplicated())
print(remaining_duplicates)

<p style="font-family:verdana; text-align:center;"> There are still products that have duplicate names. We'll decide if we want to merge them or delete them.
</p>

In [None]:
# Columns that identify a repeated product, with and without its review data.
name_key = ['Product Name']
review_key = name_key + ['Reviews', 'Rating']

# Rows repeating an earlier product name / an earlier name + reviews + rating.
duplicate_names = data.duplicated(subset=name_key)
duplicate_reviews = data.duplicated(subset=review_key)

# Tally True vs False for each of the two checks.
for duplicate_flags in (duplicate_names, duplicate_reviews):
    print(Counter(duplicate_flags))

<p style="font-family:verdana; text-align:center;"> There are a lot of products which share a product name, <i style="color:blue"> 1730 </i> to be exact. However, only <i style="color:blue"> 130 </i> of these duplicates also have the same reviews and ratings. Because the remaining duplicates have different reviews and ratings, we shall keep, for each product name, the occurrence with the highest number of reviews and treat it as the latest one (the number of reviews can't decrease over time!).
</p>

In [None]:
# Order rows by review count so that, within each product name, the row with
# the most reviews comes last; keeping the last occurrence then retains it.
by_reviews = data.sort_values(by='Reviews')
data = by_reviews.drop_duplicates(subset='Product Name', keep='last')
print("Dropped all duplicates!")

# Confirm that every product name is now unique.
print(Counter(data.duplicated(subset=['Product Name'])))

<div style="background-color:black"> <center> <h2 style="color:white"> 3) Exploratory Data Analysis 🔍 </h2> </center> </div>

<h3 style="text-align: center;"> 1. How many products per brand? </h3>
<p style="font-family:verdana; text-align:center;"> Since this data set is focused on Adidas and Nike, we will use these keywords to filter on the Brand attribute.</p>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Nike products count (case-insensitive keyword match on the Brand column)
nike_products = [p for p in data['Brand'] if "nike" in p.lower()]
num_nike = len(nike_products)

# Adidas products count (case-insensitive keyword match on the Brand column)
adidas_products = [p for p in data['Brand'] if "adidas" in p.lower()]
num_adidas = len(adidas_products)

# Customise plots: one figure with a bar chart (left) and a pie chart (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle("Number of Products by Adidas and Nike", fontsize=20)
brand_labels = ['Nike', 'Adidas']
y = [num_nike, num_adidas]
colors = ['grey', 'silver']

# Bar chart
bars = ax1.bar(x=brand_labels, height=y, color=colors)
ax1.set_title("Number of Products by Brand")
ax1.set_xlabel("Brand")
ax1.set_ylabel("Number of Products")

# Annotate each bar with its count. The label position is taken from the bar
# itself rather than hard-coded coordinates, so it stays correct whenever the
# counts change (for example after a different cleaning step).
for bar, count in zip(bars, y):
    ax1.text(
        x=bar.get_x() + bar.get_width() / 2,
        y=bar.get_height(),
        s=str(count),
        ha='center',
        va='bottom',
        fontsize=14,
    )
# Extra headroom so the label above the tallest bar is not clipped.
ax1.margins(y=0.1)

# Pie Chart
ax2.set_title("Proportion of Products by Brand")
ax2.pie(x=y, labels=brand_labels, autopct='%.2f%%', colors=colors)
fig.tight_layout(pad=3.0)

<p style="font-family:verdana; text-align:center;"> It can be seen that a much larger share of the products belongs to Adidas. One reason is that Adidas has several sub-brands or collaborations, while Nike falls under a single brand. </p>

In [None]:
# Only include adidas brands: every row whose Brand is not exactly "Nike"
adidas_brands = data[data['Brand'] != "Nike"]

# Plot bar
sns.set_palette("gray")
brand_fig, brand_ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=adidas_brands, y="Brand", ax=brand_ax)

# Title and axis label are applied after plotting: seaborn labels the axes from
# the plotted column name and, depending on the seaborn version, replaces a
# label that was set before the call.
brand_ax.set_title("Products in Adidas Sub-brands")
brand_ax.set_ylabel("Adidas Brand")
plt.show()

<h3 style="text-align: center;"> 2. Which brand is discounting more?</h3>
<p style="font-family:verdana; text-align:center;">There will be bias because there are many more Adidas products available than Nike products.</p>

<h3 style="font-family:verdana; text-align:center; background-color:silver"> Nike </h3>

<p style="font-family:verdana; text-align:center;"> Since the average discount is <i style="color:blue;">0%</i>, this can indicate that Nike's products rarely go on sale. The listing price averages around <i style="color:blue;">$4452.30</i> </p>

In [None]:
# Rows of the cleaned data whose Brand mentions Nike, using the same
# case-insensitive keyword match as the product counts. It is defined here
# because the cells from this point on read `nike_brand`, which no earlier cell
# creates, so a fresh "Restart & Run All" would otherwise stop with a NameError.
nike_brand = data[data['Brand'].str.lower().str.contains("nike", regex=False)]

# Mean listing price and mean discount across Nike's products.
nike_avg_listing = nike_brand['Listing Price'].mean(axis=0)
nike_avg_discount = nike_brand['Discount'].mean(axis=0)

print("Nike's Average Listing Price is: ${0:.2f}".format(nike_avg_listing))
print("Nike's Average Discount: {0:.2f}%".format(nike_avg_discount))

<h3 style="font-family:verdana; text-align:center; background-color:silver"> Adidas </h3>

<p style="font-family:verdana; text-align:center;"> Adidas's products have a much larger discount percentage than Nike's <i style="color:blue;">(33.21% vs 0.00%)</i>. However, it should be noted that they have more products AND their listing prices, <i style="color:blue;"> $8038.54, </i> are much higher. Generally, if the goods are expensive, then it is more likely to be discounted.</p>

In [None]:
# Rows of the cleaned data whose Brand mentions Adidas (any sub-brand), using
# the same case-insensitive keyword match as the product counts. It is defined
# here because the cells from this point on read `adidas_brand`, which no
# earlier cell creates, so a fresh "Restart & Run All" would otherwise stop with
# a NameError.
adidas_brand = data[data['Brand'].str.lower().str.contains("adidas", regex=False)]

# Mean listing price and mean discount across Adidas's products.
adidas_avg_listing = adidas_brand['Listing Price'].mean(axis=0)
adidas_avg_discount = adidas_brand['Discount'].mean(axis=0)

print("Adidas's Average Listing Price is: ${0:.2f}".format(adidas_avg_listing))
print("Adidas's Average Discount: {0:.2f}%".format(adidas_avg_discount))

<h3 style="text-align: center;"> 3. Which brand has better reviews and ratings? </h3>

<p style="font-family:verdana; text-align:center;"> Let's look at the average ratings and reviews for Nike and Adidas. Does one consistently perform better than the other? </p>

<h3 style="font-family:verdana; text-align:center; background-color:silver"> Nike </h3>

<p style="font-family:verdana; text-align:center;"> From the <code> .describe()</code> function, we can see that for <b>Ratings</b>, standard deviation is quite small, <i style="color:blue;">2.08</i> and mean is <i style="color:blue;">2.94</i>. This is a poor satisfaction rating for their customers. <br /> <br /> For the <b>Reviews</b>, the standard deviation is very large, at a value of <i style="color:blue;">18.56</i>. This is because products may be extremely popular, or new which drastically affects the number of people reviewing them. The mean is <i style="color:blue;">8.36</i> but this number may not be reliable.</p>


In [None]:
# Summary statistics of Rating and Reviews for Nike's products.
nike_brand.loc[:, ['Rating', 'Reviews']].describe()

<h3 style="font-family:verdana; text-align:center; background-color:silver"> Adidas </h3>

<p style="font-family:verdana; text-align:center;"> From the <code> .describe()</code> function, we can see that for <b>Ratings</b>, standard deviation is again, small at <i style="color:blue;">1.14</i> and mean is <i style="color:blue;">3.38</i>. This is much higher than Nike's products which shows better satisfaction. <br /> <br /> The <b>Reviews</b> standard deviation is even larger than Nike's, at a value of <i style="color:blue;">28.04</i>. Note that Adidas has a wider product range than Nike so number of reviews will fluctuate more due to more products. Despite this fluctuation, mean is <i style="color:blue;">60.68</i>, much greater than Nike's <i style="color:red;">8.36</i>. It appears more people rate Adidas's products than Nike.</p>


In [None]:
# Summary statistics of Rating and Reviews for Adidas's products.
adidas_brand.loc[:, ['Rating', 'Reviews']].describe()

<h3 style="text-align: center;"> 4. How often are people visiting these products? </h3>
<p style="font-family:verdana; text-align:center;"> Is there a trend in the date and time these products were last visited? We will order the <b> Last Visited </b> values from oldest to newest and count how many records fall in each minute time-slot for both Adidas and Nike.</p>

In [None]:
from datetime import datetime, timedelta
from collections import Counter

# Tally, per brand, how many products share each Last Visited value
# (the column holds the minute of the visit at this point).
nike_counter = Counter(nike_brand['Last Visited'])
adidas_counter = Counter(adidas_brand['Last Visited'])

# One-column frames: index = Last Visited value, "count" = number of products.
nike_time_df = pd.DataFrame(
    list(nike_counter.values()),
    index=list(nike_counter.keys()),
    columns=['count'],
)
adidas_time_df = pd.DataFrame(
    list(adidas_counter.values()),
    index=list(adidas_counter.keys()),
    columns=['count'],
)

# Draw both brands on the same axes; the order of the plotting calls is kept.
plt.figure(figsize=(10,6))
sns.set_style("whitegrid")
sns.lineplot(data=nike_time_df, x=nike_time_df.index, y="count", label="Nike", color="silver")
plt.ylim(0,800)
plt.xlabel("Minute")
plt.title("Product Visits per Minute for Nike and Adidas")
sns.lineplot(data=adidas_time_df, x=adidas_time_df.index, y="count", label="Adidas")

<p style="font-family:verdana; text-align:center;"> There are more visits for Adidas in a shorter time frame than Nike. Numbers peak to more than <i style="color:blue;">700</i> within <i style="color:blue;">5 </i>minutes while Nike fluctuates about <i style="color:red;">50</i> visits across <i style="color:red;">20</i> minutes. This doesn't tell us much about the products but the spikes from Adidas can be indicative of their many products available.</p>

<h3 style="text-align: center;"> 5. What is the best product per brand? </h3>

<p style="font-family:verdana; text-align:center;"> Which product from each brand is considered the best? We will look at both the ratings and reviews for the products. </p>

<h3 style="font-family:verdana; text-align:center; background-color:silver"> Nike </h3>
<p style="font-family:verdana; text-align:center;"> Nike has <i style="color:blue;">120</i> products which have a <b>Rating</b> of <i style="color:blue;">5</i> stars. </p>

In [None]:
# Highest rating among Nike's products and the rows that reach it.
nike_max_rating = nike_brand['Rating'].max()
nike_is_top_rated = nike_brand['Rating'].eq(nike_max_rating)
nike_max_rated_product = nike_brand.loc[nike_is_top_rated]
print("Nike's Max Rating: {}\n".format(nike_max_rating))

print("Number of Top Rated Products: {}".format(nike_max_rated_product.shape[0]))
print("Nike's Top Products: \n")

# List the name of every top-rated product, one per line.
for product_name in nike_max_rated_product['Product Name']:
    print(product_name)

<p style="font-family:verdana; text-align:center;"> In order to determine what's most popular, we'll use the number of <b>Reviews</b> to give a better ranking of these top rated products. </p>

In [None]:
# Keep only the name and review count of Nike's top-rated products.
nike_review_columns = ['Product Name', 'Reviews']
nike_top_reviews = nike_max_rated_product.loc[:, nike_review_columns]
print(nike_top_reviews)

In [None]:
# Display Nike's top-rated products ordered from most to fewest reviews.
nike_top_reviews.sort_values(by='Reviews', ascending=False)

<p style="font-family:verdana; text-align:center;"> It can be seen that <b>Nike Air Force 1 '07 LVB</b> is the most popular with a <b>Rating</b> of <i style="color:blue;">5</i> and a number of <i style="color:blue;">6</i> <b>Reviews</b>.</p>

<h3 style="font-family:verdana; text-align:center; background-color:silver"> Adidas </h3>
<p style="font-family:verdana; text-align:center;"> Let's look at Adidas for comparison. First, see which products have the max <b>Rating</b>.</p>

In [None]:
# Highest rating among Adidas's products and the rows that reach it.
adidas_max_rating = adidas_brand['Rating'].max()
adidas_is_top_rated = adidas_brand['Rating'].eq(adidas_max_rating)
adidas_max_rated_product = adidas_brand.loc[adidas_is_top_rated]
print("Adidas's Max Rating: {}\n".format(adidas_max_rating))

print("Number of Top Rated Products: {}".format(adidas_max_rated_product.shape[0]))
print("Adidas's Top Products: \n")

# List the name of every top-rated product, one per line.
for product_name in adidas_max_rated_product['Product Name']:
    print(product_name)

<p style="font-family:verdana; text-align:center;"> By obtaining maximum <b> Reviews </b> after getting the top rated products, <b>Men's Adidas Toe Side II Slippers</b> and <b>Men's Adidas Running Asweego Shoes</b> are the most popular with the highest reviews of <i style="color:blue;">99</i> and a max rating of <i style="color:blue;">5</i> stars. </p>

In [None]:
# Keep only the name and review count of Adidas's top-rated products,
# then display them ordered from most to fewest reviews.
adidas_review_columns = ['Product Name', 'Reviews']
adidas_top_reviews = adidas_max_rated_product.loc[:, adidas_review_columns]
adidas_top_reviews.sort_values(by='Reviews', ascending=False)

<p style="font-family:verdana; text-align:center;"> Interestingly, despite a lower overall rating for Nike's products <i style="color:blue;">(2.94)</i>, they have more 5-star-rated items than Adidas. But Adidas is more consistent with their products, with many possessing 90+ reviews.</p>
    

<div style="background-color:black"> <center> <h2 style="color:white"> 4) Summary 📝 </h2> </center> </div>

<div align="center">
<p style="font-family:verdana; text-align:center;"> This data set compares Adidas and Nike products, however, it isn't indicative of the performance of companies. Several data cleaning processes were used including: </p>
<ol style="font-family:verdana; background-color: #A9A9A9; color:white; display:table; margin:1px; text-align:left; border:2px solid black; border-radius: 10px; box-shadow: 2px 2px 4px #000000;">
    <li style="padding: 2px 5px;"> Changing the dates </li>
    <li style="padding: 2px 5px;"> Removing duplicates </li>
    <li style="padding: 2px 5px;"> Removing redundant information </li>
    <li style="padding: 2px 5px;"> Changing null values to empty strings </li>
</ol>
</div>

 <p style="font-family:verdana; text-align:center;">When comparing the number of products, Adidas has many more than Nike. Additionally, Adidas is split into 3 brands: Adidas CORE/NEO, Adidas SPORT PERFORMANCE and Adidas ORIGINALS, while Nike has just the one brand.</p>

 <p style="font-family:verdana; text-align:center;"> Furthermore, overall ratings and reviews are higher for Adidas than for Nike in this dataset. The visits for Adidas's products are much higher than Nike's and seem to fall within a much smaller time-span. Whilst both have products with the maximum rating of 5 stars, Nike has more 5-star-rated products, while Adidas has many more reviews on each of its 5-star products. <br /> Perhaps this can be indicative of Adidas's huge popularity with particular pieces while Nike has a nicer overall range of products. </p>
 
 