## **<span style="color:#023e8a;">Intro</span>**

**<span style="color:#023e8a;">The competition is dedicated to product recommendations (H&M)  </span>**

**<span style="color:#023e8a;">Here we have different kinds of data that help us produce good recommendations: </span>**

📸 `images` - images of every article_id

🙋 `articles`  - detailed metadata of every article_id

👔 `customers`  - detailed metadata of every customer_id

🧾 `transactions_train`  - purchases with details

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

In [None]:
# All three competition tables live in the same Kaggle input directory.
_INPUT_DIR = "../input/h-and-m-personalized-fashion-recommendations"

articles = pd.read_csv(_INPUT_DIR + "/articles.csv")
customers = pd.read_csv(_INPUT_DIR + "/customers.csv")
transactions = pd.read_csv(_INPUT_DIR + "/transactions_train.csv")

## **<span id="Articles" style="color:#023e8a;">2. Articles</span>**

**<span style="color:#023e8a;"> This table contains all h&m articles with details such as a type of product, a color, a product group and other features.</span>**  
**<span style="color:#023e8a;"> Article data description: </span>**

> `article_id` **<span style="color:#023e8a;">: A unique identifier of every article.</span>**  
> `product_code`, `prod_name` **<span style="color:#023e8a;">: A unique identifier of every product and its name (not the same).</span>**  
> `product_type`, `product_type_name` **<span style="color:#023e8a;">: The group of product_code and its name</span>**  
> `graphical_appearance_no`, `graphical_appearance_name` **<span style="color:#023e8a;">: The group of graphics and its name</span>**  
> `colour_group_code`, `colour_group_name` **<span style="color:#023e8a;">: The group of color and its name</span>**  
> `graphical_appearance_no`, `graphical_appearance_name` **<span style="color:#023e8a;">: The group of graphics and its name</span>**  
> `perceived_colour_value_id`, `perceived_colour_value_name`, `perceived_colour_master_id`, `perceived_colour_master_name` **<span style="color:#023e8a;">: The added color info</span>**  
> `department_no`, `department_name` **<span style="color:#023e8a;">: A unique identifier of every department and its name</span>**  
> `index_code`, `index_name` **<span style="color:#023e8a;">: A unique identifier of every index and its name</span>**  
> `index_group_no`, `index_group_name` **<span style="color:#023e8a;">: A group of indices and its name</span>**  
> `section_no`, `section_name` **<span style="color:#023e8a;">: A unique identifier of every section and its name</span>**  
> `garment_group_no`, `garment_group_name` **<span style="color:#023e8a;">: A unique identifier of every garment group and its name</span>**  
> `detail_desc` **<span style="color:#023e8a;">: Details</span>**  

In [None]:
articles.head()

In [None]:
len(articles)

In [None]:
def quick_dups(column_name, table=articles):
    """Print a quick summary of how often each distinct value of a column occurs.

    Parameters
    ----------
    column_name : str
        Name of the column in ``table`` to inspect.
    table : pandas.DataFrame, optional
        Frame that holds the column. Defaults to the ``articles`` frame.

    Output (printed, nothing is returned)
    -------------------------------------
    1. Total number of rows, followed by the min / max / mean / median number
       of occurrences per distinct value.
    2. The ten largest occurrence counts, in descending order.
    3. The ten values those counts belong to, in the same order.
    """
    # dropna=False keeps missing values as one group instead of dropping them
    # (a per-row dict count would treat every float NaN as a separate key).
    counts = table[column_name].value_counts(dropna=False)

    if counts.empty:
        # np.min / np.max raise on empty input, so report and stop early.
        print(0, 'rows - nothing to summarise')
        return

    occurrences = counts.to_numpy()
    # Going through a plain list lets numpy pick a printable dtype for the
    # keys, independent of the pandas index type.
    values = np.array(counts.index.tolist())

    print(occurrences.sum(), np.min(occurrences), np.max(occurrences),
          np.mean(occurrences), np.median(occurrences))

    # value_counts already sorts by descending frequency.
    print(occurrences[0:10])
    print(values[0:10])

In [None]:
quick_dups('product_code')

In [None]:
def quick_bar_chart(
    column_name, figsize, subset=None, table=articles, debug=False,
    labelAxisX='TODO', labelAxisY='TODO', labelTitle='TODO', save=None):
    """Draw a bar chart of value frequencies for one column, most frequent first.

    Parameters
    ----------
    column_name : str
        Column of ``table`` whose values are counted.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    subset : int, optional
        Plot only the ``subset`` most frequent values. ``None`` plots every
        value. Booleans are treated like ``None``: a positional ``False`` or
        ``True`` would otherwise act as the slice bounds 0 / 1 and produce an
        empty or single-bar chart.
    table : pandas.DataFrame, optional
        Frame that holds the column. Defaults to the ``articles`` frame.
    debug : bool, optional
        When true, print the counts and the sorted keys before plotting.
    labelAxisX, labelAxisY, labelTitle : str, optional
        Axis labels and figure title.
    save : str, optional
        If given, the figure is also written to this path.
    """
    # dropna=False keeps missing values as a single category; the result is
    # already ordered by descending frequency.
    counts = table[column_name].value_counts(dropna=False)

    if debug:
        print(counts.to_dict())

    # Building the key array from a plain list lets numpy stringify mixed
    # str/NaN keys so they can be used as categorical bar positions.
    keys = np.array(counts.index.tolist())
    heights = counts.to_numpy()

    if debug:
        print(keys)

    # bool is a subclass of int, so exclude it explicitly (see docstring).
    if subset is not None and not isinstance(subset, bool):
        keys = keys[0:subset]
        heights = heights[0:subset]

    fig = plt.figure(figsize=figsize, dpi=80)
    ax = fig.add_axes([0, 0, 1, 1])
    ax.bar(keys, heights, facecolor='#CC071E')
    fig.align_labels()
    plt.xlabel(labelAxisX)
    plt.ylabel(labelAxisY)
    plt.title(labelTitle)
    plt.xticks(rotation=45, ha='right', rotation_mode='anchor')

    if save is not None:
        # Save only after all styling is applied. The axes fill the whole
        # figure, so bbox_inches='tight' is needed to keep the title and tick
        # labels inside the written image.
        plt.savefig(save, dpi=80, bbox_inches='tight')
    plt.show()

In [None]:
#prod_name, detail_desc
#quick_bar_chart('product_type_name', (15, 3))

In [None]:
quick_bar_chart('garment_group_name', (10, 3))

In [None]:
# The third positional parameter is `subset`; a stray `True` there acts as the
# slice bound 1 and would plot only the single most frequent department.
quick_bar_chart('department_name', (15, 3))

In [None]:
quick_bar_chart('graphical_appearance_name', (5, 3), subset=20, labelAxisX='Pattern', labelAxisY='Count', labelTitle='Count of Different Patterns')

In [None]:
quick_bar_chart('product_group_name', (5, 3))

In [None]:
quick_bar_chart('colour_group_name', (5, 3),
                labelAxisX='Color',
                labelAxisY='Count',
                labelTitle='Count of different Colors of Clothing',
                save='color-count.png',
                subset=20)

In [None]:
quick_bar_chart('index_group_name', (3, 3), labelAxisX='Group', labelAxisY='Count', labelTitle='Count of Products over Groups')

In [None]:
quick_bar_chart('index_group_name', (3, 3)) # same as above, redundant column

In [None]:
quick_bar_chart('section_name', (15, 3))

**<span style="color:#023e8a;"> Customers data description: </span>**

> `customer_id` **<span style="color:#023e8a;">: A unique identifier of every customer</span>**  
> `FN` **<span style="color:#023e8a;">: 1 or missing </span>**  
> `Active` **<span style="color:#023e8a;">: 1 or missing</span>**  
> `club_member_status` **<span style="color:#023e8a;">: Status in club</span>**  
> `fashion_news_frequency` **<span style="color:#023e8a;">: How often H&M may send news to customer</span>**  
> `age` **<span style="color:#023e8a;">: The current age</span>**  
> `postal_code` **<span style="color:#023e8a;">: Postal code of customer</span>**  

In [None]:
pd.options.display.max_rows = 50
customers.head()

In [None]:
#quick_bar_chart('postal_code', (3, 3), False, customers, True)
len(customers)

In [None]:
quick_dups('postal_code', customers)

In [None]:
# Pass the frame by keyword: a positional `False` lands in `subset` and slices
# the data to zero bars.
quick_bar_chart('fashion_news_frequency', (3, 3), table=customers)

In [None]:
# Pass the frame by keyword: a positional `False` lands in `subset` and slices
# the data to zero bars.
quick_bar_chart('club_member_status', (3, 3), table=customers)

In [None]:
customers['age']

In [None]:
np.max(customers['age']), np.min(customers['age']), np.mean(customers['age']), np.median(customers['age'])

In [None]:
# Histogram of customer ages, also written to disk as a PNG.
# 99-16 = 83 bins; presumably one bin per year for ages 16..99 - confirm
# against the min/max printed in the previous cell.
fig, ax = plt.subplots(figsize=(10, 6), dpi=80)
n, bins, patches = ax.hist(customers['age'], 99-16, density=False, facecolor='#CC071E')
ax.set_xlabel('Age (Years)')
ax.set_ylabel('Count')
ax.set_title('Histogram of H&M Customer Ages')
ax.set_xlim(15, 100)
ax.set_ylim(0, 70000)
ax.grid(True)
ax.set_xticks(np.arange(15, 105, step=5))
fig.savefig("histogram-customer-ages.png", dpi=80, format='png')
plt.show()

## **<span id="Transactions" style="color:#023e8a;">4. Transactions</span>**

[**<span style="color:#FEF1FE;background-color:#023e8a;border-radius: 5px;padding: 2px">Go to Table of Content</span>**](#Content)

**<span style="color:#023e8a;"> Transactions data description: </span>**

> `t_dat` **<span style="color:#023e8a;">: Date of the transaction (YYYY-MM-DD)</span>**  
> `customer_id` **<span style="color:#023e8a;">: A unique identifier of every customer </span>**  **<span style="color:#FF0000;">(in </span>** `customers` **<span style="color:#FF0000;"> table)</span>**  
> `article_id` **<span style="color:#023e8a;">: A unique identifier of every article</span>**  **<span style="color:#FF0000;">(in </span>** `articles` **<span style="color:#FF0000;"> table)</span>**  
> `price` **<span style="color:#023e8a;">: Price of purchase</span>**  
> `sales_channel_id` **<span style="color:#023e8a;">: 1 or 2</span>**  

In [None]:
transactions.head()

In [None]:
len(transactions)

In [None]:
from datetime import datetime

In [None]:
datetime.strptime('Jun 1 2005  1:33PM', '%b %d %Y %I:%M%p')

In [None]:
datetime.strptime('2018-09-20', '%Y-%m-%d')

In [None]:
# Aggregate transaction counts per day, per 7-day block and per month.
#
# Only every SAMPLE_STRIDE-th row is counted, so every total below is a
# 1-in-100 sample count, not the full number of transactions.
SAMPLE_STRIDE = 100
sampled_dates = pd.to_datetime(
    transactions['t_dat'].iloc[::SAMPLE_STRIDE], format='%Y-%m-%d')

# 'YYYY-MM-DD' / 'YYYY-MM' keys sort lexicographically in chronological order,
# so sort_index() yields time-ordered dicts regardless of the row order.
transactionsDay1 = (
    sampled_dates.dt.strftime('%Y-%m-%d').value_counts().sort_index().to_dict())
transactionsMonth1 = (
    sampled_dates.dt.strftime('%Y-%m').value_counts().sort_index().to_dict())

datesDay = [datetime.strptime(day, '%Y-%m-%d') for day in transactionsDay1]
transactionsDay = list(transactionsDay1.values())

# Weekly totals: each bin sums seven consecutive *observed* days and is dated
# by its first day. A calendar day with no sampled transaction is absent from
# datesDay and would shift later bins.
# The stop value len - 6 keeps the last complete block of seven; a stop of
# len - 7 drops it whenever the number of days is a multiple of 7.
datesDay7 = []
transactionsDay7 = []
transactionsDay7Avg = []
for start in range(0, len(datesDay) - 6, 7):
    week_total = sum(transactionsDay[start:start + 7])
    datesDay7.append(datesDay[start])
    transactionsDay7.append(week_total)
    transactionsDay7Avg.append(week_total / 7)

datesMonth = [datetime.strptime(month, '%Y-%m') for month in transactionsMonth1]
transactionsMonth = list(transactionsMonth1.values())

In [None]:
np.min(datesDay), np.max(datesDay), len(datesDay), len(datesDay)/365, datesDay[0:10]

In [None]:
np.min(datesMonth), np.max(datesMonth), len(datesMonth), len(datesMonth)/12

In [None]:
np.min(transactionsDay), np.max(transactionsDay), transactionsDay[0:10]

In [None]:
np.min(transactionsMonth), np.max(transactionsMonth), transactionsMonth[0:10]

In [None]:
# Scatter of sampled transaction counts per day.
plt.figure(figsize=(10, 6), dpi=80)
# plt.plot handles datetime x-values natively; 'o' reproduces the marker-only
# style of the deprecated plt.plot_date.
plt.plot(datesDay, transactionsDay, 'o')
plt.grid(True)
plt.xlabel('Date (YYYY-MM)')
plt.ylabel('Number of Transactions')
plt.title('Transactions per Day')
plt.show()

In [None]:
# Bar chart of sampled transaction counts per day.
fig = plt.figure(figsize=(10,6), dpi=80)
ax = fig.add_axes([0,0,1,1])
ax.bar(datesDay,transactionsDay)
# Label the figure so it can be read on its own.
ax.set_xlabel('Date (YYYY-MM)')
ax.set_ylabel('Number of Transactions')
ax.set_title('Transactions per Day')
fig.align_labels()
plt.xticks(rotation=45, ha='right', rotation_mode='anchor')
plt.show()

In [None]:
# Scatter of sampled transaction counts per 7-day block.
plt.figure(figsize=(5, 3), dpi=80)
# plt.plot handles datetime x-values natively; 'o' reproduces the marker-only
# style of the deprecated plt.plot_date.
plt.plot(datesDay7, transactionsDay7, 'o', color='#CC071E')
plt.xlabel('Date (YYYY-MM)')
plt.ylabel('Number of Transactions')
plt.title('Transactions per Week')
plt.xticks(rotation=45, ha='right', rotation_mode='anchor')
plt.grid(True)
plt.show()

In [None]:
# Scatter of the mean daily transaction count within each 7-day block.
plt.figure(figsize=(10, 6), dpi=80)
# plt.plot handles datetime x-values natively; 'o' reproduces the marker-only
# style of the deprecated plt.plot_date.
plt.plot(datesDay7, transactionsDay7Avg, 'o')
plt.xlabel('Date (YYYY-MM)')
plt.ylabel('Average Transactions per Day')
plt.title('Transactions per Day (7-Day Average)')
plt.grid(True)
plt.show()

In [None]:
# Scatter of sampled transaction counts per month.
plt.figure(figsize=(10, 6), dpi=80)
# plt.plot handles datetime x-values natively; 'o' reproduces the marker-only
# style of the deprecated plt.plot_date.
plt.plot(datesMonth, transactionsMonth, 'o', color='#CC071E')
plt.xlabel('Date (YYYY-MM)')
plt.ylabel('Number of Transactions')
plt.title('Transactions per Month')
plt.grid(True)
plt.show()

In [None]:
# Bar chart of sampled transaction counts per month.
fig = plt.figure(figsize=(10,6), dpi=80)
ax = fig.add_axes([0,0,1,1])
# On a date axis the bar width is measured in days; the default of 0.8 would
# draw each monthly bar as a hairline, so widen it to most of a month.
ax.bar(datesMonth, transactionsMonth, width=20)
ax.set_xlabel('Date (YYYY-MM)')
ax.set_ylabel('Number of Transactions')
ax.set_title('Transactions per Month')
fig.align_labels()
plt.xticks(rotation=45, ha='right', rotation_mode='anchor')
plt.show()

In [None]:
# Pass the frame by keyword: a positional `False` lands in `subset` and slices
# the data to zero bars.
quick_bar_chart('t_dat', (3, 3), table=transactions)