In [None]:
import ast
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from matplotlib import style
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpgrowth

In [None]:
# Load the transaction export and work on a copy, so `read_df` keeps the
# frame exactly as it was read from disk.
read_df = pd.read_csv('data/preprocessed_transactions.csv')
df = read_df.copy()

print(df.head(10))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that only adds a stray "None" line).
df.info()
print(df.shape)
print(df.size)

In [None]:
# Number of missing values in each column of the loaded frame.
df.isnull().sum()

In [None]:
# Keep only rows that satisfy every one of these conditions:
#   * UserId, ItemCode, NumberOfItemsPurchased and CostPerItem are all > 0
#   * ItemDescription is not missing
#   * the last four characters of TransactionTime (presumably the year) are
#     not '2028'
MAX_ROWS = 1000  # size of the working sample used by the following cells

valid_rows = (
    (df.UserId > 0)
    & (df.ItemCode > 0)
    & (df.NumberOfItemsPurchased > 0)
    & (df.CostPerItem > 0)
    & df.ItemDescription.notna()
    & (df.TransactionTime.str[-4:] != '2028')
)
# .copy() detaches the sample from the frame it was sliced from, so adding or
# overwriting columns later does not trigger SettingWithCopyWarning.
df = df[valid_rows].iloc[:MAX_ROWS].copy()
# df.to_csv('./data/preprocesed_data.csv',index=False)
# info() prints its own report and returns None; do not wrap it in print().
df.info()
print(df.head(10))

In [None]:
# Exploratory analysis: number of distinct transactions in each calendar month.
# Parse the timestamps once and derive the monthly period from the parsed
# column (a second pd.to_datetime call on it would be redundant work).
df['TransactionTime'] = pd.to_datetime(df['TransactionTime'])
df['month_year'] = df['TransactionTime'].dt.to_period('M')
# Reassign instead of sorting in place so the step stays chain-friendly.
df = df.sort_values(by=['month_year'])
Ser = df.groupby('month_year').TransactionId.nunique()
# Integer x positions, one per month, used for plotting and tick placement.
x = np.arange(len(Ser))

In [None]:
# Filled line chart of the per-month transaction counts held in `Ser`,
# drawn at the integer positions in `x` and labelled with the month periods.
style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(10, 10))
ax1.plot(x, Ser, color='k')
ax1.fill_between(x, Ser, color='r', alpha=0.5)
ax1.set_xticks(x)
ax1.set_xticklabels(Ser.index)
ax1.set_xlabel('Time period')
ax1.set_ylabel('No. of transactions')
# plt.savefig('./images/graph1.png')

In [None]:
# Count of distinct item descriptions in each transaction, followed by its
# summary statistics.
Ser = df.groupby('TransactionId')['ItemDescription'].nunique()
Ser.describe()

In [None]:
# Histogram of the per-transaction item counts in `Ser`, using bin edges
# every 10 items from 0 up to and including 200.
bins = list(range(0, 201, 10))
fig, ax1 = plt.subplots(figsize=(10, 10))
ax1.hist(Ser, bins=bins, histtype='bar', rwidth=0.5)
ax1.set_xticks(bins)
ax1.set_xlabel('No. of items')
ax1.set_ylabel('No. of transactions')
plt.show()

In [None]:
# Row total = quantity purchased x cost per item; then sum per item
# description and keep the ten largest totals for a horizontal bar chart.
df['total_cost_item'] = df['NumberOfItemsPurchased'] * df['CostPerItem']
Ser = (
    df.groupby('ItemDescription')['total_cost_item']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
fig, ax = plt.subplots(figsize=(10, 10))
ax.barh(Ser.index, Ser, height=0.5)

In [None]:
# Reload the cleaned transactions from disk and pivot them into a basket
# matrix: one row per TransactionId, one column per ItemDescription, each cell
# holding the summed quantity purchased.
# NOTE(review): no active line visible in this notebook writes this CSV (the
# only to_csv for it is commented out), so the file must already exist.
df = pd.read_csv('./data/preprocesed_data.csv')
df_set = (
    df.groupby(['TransactionId', 'ItemDescription'])['NumberOfItemsPurchased']
    .sum()
    # fill_value=0 fills the missing (transaction, item) pairs while pivoting,
    # so no NaN-filled float matrix is built and then patched with fillna().
    # TransactionId is already the index after unstack(), so the former
    # reset_index()/set_index() round trip is unnecessary.
    .unstack(fill_value=0)
)
df_set.head()

In [None]:
# info() prints its report directly and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
df_set.info()
print(df_set.shape)

In [None]:
# Turn the quantity matrix into 0/1 flags (1 where the quantity is > 0).
# This vectorised comparison replaces an element-wise applymap, which was
# far too slow on this frame.
df_set = df_set.gt(0).astype(np.int8)
# df_set.to_csv('./data/transactions.csv',index=False)
df_set

In [None]:
# Mine frequent itemsets from the one-hot encoded transaction matrix on disk.
# NOTE(review): no row limit is applied in this cell; the CSV is presumably a
# reduced sample already, because the full data did not fit in memory.
df_set = pd.read_csv('./data/transactions.csv')
# Validate explicitly before casting: astype(bool) would silently turn any
# non-zero value (e.g. a stray id column) into True.
if not df_set.isin([0, 1]).all().all():
    raise ValueError('transactions.csv must contain only 0/1 (or True/False) values')
# mlxtend expects a boolean frame; non-bool input is slower and its support
# is flagged for possible removal.
df_set = df_set.astype(bool)
frequent_itemsets = fpgrowth(df_set, min_support=0.015, max_len=4, use_colnames=True)
# frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: tuple(x))
# frequent_itemsets.to_csv('./data/frequent_itemsets.csv',index=False)
frequent_itemsets

In [None]:
# Show the frequent itemsets ordered from highest to lowest support.
frequent_itemsets.sort_values(by='support', ascending=False)

In [None]:
def _parse_itemset(text):
    """Convert the CSV text of one itemset back into a frozenset.

    Accepts the repr of a tuple/list/set (e.g. "('A', 'B')") as well as the
    repr of a frozenset (e.g. "frozenset({'A', 'B'})").
    """
    text = text.strip()
    if text.startswith('frozenset(') and text.endswith(')'):
        # Strip the wrapper; "frozenset()" (empty) becomes an empty tuple.
        text = text[len('frozenset('):-1] or '()'
    return frozenset(ast.literal_eval(text))


# A CSV round trip stores every itemset as plain text. Left as strings,
# list()/frozenset() on them would split the text into single characters, so
# they are parsed back into frozensets while loading.
frequent_itemsets = pd.read_csv(
    './data/frequent_itemsets.csv',
    converters={'itemsets': _parse_itemset},
)
frequent_itemsets

In [None]:

def _itemset_label(items):
    """Return a printable, order-stable label for one itemset."""
    if isinstance(items, str):
        # Already text (e.g. loaded from CSV): use it unchanged.
        return items
    # Sets have no defined iteration order; sort so labels are reproducible.
    return str(sorted(items))


# The 20 itemsets with the highest support, with `itemsets` turned into text
# labels for the bar chart. assign() builds a new frame, which avoids the
# chained `top_items.itemsets.iloc[i] = ...` assignment on a slice (it raises
# SettingWithCopyWarning and has no effect under pandas Copy-on-Write).
top_items = (
    frequent_itemsets.sort_values('support', ascending=False)
    .head(20)
    .assign(itemsets=lambda frame: frame['itemsets'].map(_itemset_label))
)
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(top_items['itemsets'], top_items['support'])
# Long itemset labels only fit when drawn vertically.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Item')
ax.set_ylabel('Support')
# plt.savefig('./images/figure1.png')
plt.show()

In [None]:
# Build association rules from the frequent itemsets, using confidence as the
# evaluation metric with a minimum threshold of 0.2.
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.2)
rules

In [None]:
# Summary statistics for the columns of the `rules` frame.
rules.describe()

In [None]:
# The ten rules with the highest confidence.
top_rules = rules.sort_values(by='confidence', ascending=False).head(10)
top_rules

In [None]:
# Summary statistics for the columns of the `rules` frame.
# NOTE(review): an identical `rules.describe()` call appears in an earlier
# cell; this repeat looks like a candidate for removal — confirm.
rules.describe()

In [None]:
# Scatter of the top rules: support on x, confidence on y, and lift passed as
# the marker-size argument `s`.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(x=top_rules['support'], y=top_rules['confidence'], s=top_rules['lift'])

In [None]:
# Draw the top rules as a directed graph:
#     antecedent item -> rule node ('r0', 'r1', ...) -> consequent item
# Rule nodes are red, item nodes black; all edges of one rule share a colour.
GRAPH_SEED = 42  # fixes edge colours and layout so the figure is reproducible

G1 = nx.DiGraph()
# One node per rule actually present, so fewer than ten rules cannot raise an
# IndexError as the former hard-coded range(10) did.
rule_nodes = ['r' + str(i) for i in range(len(top_rules))]
rule_colors = np.random.default_rng(GRAPH_SEED).random(len(rule_nodes))

for i, rule_node in enumerate(rule_nodes):
    # add_node(), not add_nodes_from(): the latter iterates its argument, so a
    # string such as 'r0' would add the stray nodes 'r' and '0'.
    G1.add_node(rule_node)
    rule = top_rules.iloc[i]
    for a in rule['antecedents']:
        # add_edge() creates missing endpoint nodes on its own.
        G1.add_edge(a, rule_node, color=rule_colors[i], weight=2)
    for c in rule['consequents']:
        G1.add_edge(rule_node, c, color=rule_colors[i], weight=2)

rule_node_set = set(rule_nodes)
color_map = ['red' if node in rule_node_set else 'black' for node in G1]

edges = G1.edges()
colors = [G1[u][v]['color'] for u, v in edges]
weights = [G1[u][v]['weight'] for u, v in edges]
pos = nx.spring_layout(G1, k=16, scale=1, seed=GRAPH_SEED)
fig = plt.figure(figsize=(20, 20))
nx.draw(G1, pos, node_color=color_map, edge_color=colors, width=weights, font_size=16, with_labels=False)
# Shift the label positions upwards so the text sits above the node markers.
for p in pos:
    pos[p][1] += 0.07

nx.draw_networkx_labels(G1, pos)
# savefig() does not create missing directories.
os.makedirs('./images', exist_ok=True)
plt.savefig('./images/web1.png')
plt.show()