In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob

import networkx as nx
import nxviz

In [None]:
# Merge every file matched by the 'Sales_Data/*' pattern into a single CSV.
# glob.glob returns paths in arbitrary, OS-dependent order, so the paths are
# sorted to make the row order of the merged file reproducible.
directory = 'Sales_Data/*'
filenames = sorted(glob.glob(directory))
dfs = [pd.read_csv(filename) for filename in filenames]
# ignore_index gives the combined frame a clean 0..n-1 index instead of
# repeating each file's own row labels.
df = pd.concat(dfs, ignore_index=True)
df.to_csv('sales_data.csv', index=False)

In [None]:
# Read new single file
# Loads 'sales_data.csv' into df; the trailing expression previews the
# first rows as the cell output.
df = pd.read_csv('sales_data.csv')
df.head()

In [None]:
# Discard every row that contains at least one missing value
df = df.dropna()

In [None]:
# Remove rows that are just the column names repeated as data
# (their 'Order ID' value is the literal text 'Order ID').
# .copy() makes df an independent frame, so later cells that add columns
# to it do not write into a filtered slice (SettingWithCopyWarning).
df = df[df['Order ID'] != 'Order ID'].copy()

In [None]:
# Split 'Purchase Address' into 'City' and 'State' columns.
# The address is split on ', ': the second piece is kept as the city and the
# first two characters of the third piece as the state. The first piece
# (street) is not kept.
# Using the vectorised .str accessor avoids building a NumPy array from
# per-row lists, which breaks when rows split into different numbers of pieces.
address_parts = df['Purchase Address'].str.split(', ', expand=True)
df['City'] = address_parts[1]
df['State'] = address_parts[2].str[:2]
df = df.drop(columns=['Purchase Address'])

In [None]:
# Order total = quantity ordered x price each (both parsed as numbers first)
quantity = pd.to_numeric(df['Quantity Ordered'])
unit_price = pd.to_numeric(df['Price Each'])
df['Order Price'] = quantity * unit_price

In [None]:
# Write the current state of df to 'clean_data.csv' without the index column
df.to_csv('clean_data.csv', index=False)

In [None]:
# Cast each column to its analysis dtype.
# The mapping is keyed by column name rather than zipped positionally onto
# df.columns, so a change in column order cannot silently assign a dtype to
# the wrong column (a missing column raises KeyError instead).
col_types = {
    'Order ID': 'object',
    'Product': 'object',
    'Quantity Ordered': 'int',
    'Price Each': 'float',
    'Order Date': 'object',
    'City': 'category',
    'State': 'category',
    'Order Price': 'float',
}
df = df.astype(col_types)

# Parse 'Order Date' as datetime. It stays a regular column here; it is not
# set as the index in this cell.
df['Order Date'] = pd.to_datetime(df['Order Date'])

In [None]:
# Column data types and DataFrame Info
# Prints a summary of df as the cell output.
df.info()

In [None]:
# Preview the first 10 rows of df
df.head(10)

In [None]:
# What was the best month for sales?
# Index by order date, then total 'Order Price' within each month-end bin.
sales_per_month = (
    df.set_index('Order Date')[['Order Price']]
    .resample('M')
    .sum()
)
sales_per_month.plot()


# Answer
# December was the best month for sales.

In [None]:
# Which city sold the most products?
# Only 'Quantity Ordered' is aggregated: a bare groupby('City').sum() would
# also try to sum the text, categorical and datetime columns, which is slow
# and raises TypeError on recent pandas versions.
# NOTE(review): grouping is by city name alone, so same-named cities in
# different states would be merged; confirm whether that is intended.
city_count = (
    df.groupby('City')[['Quantity Ordered']]
    .sum()
    .sort_values('Quantity Ordered', ascending=False)
    .reset_index()
)
sns.barplot(x='City', y='Quantity Ordered', data=city_count, order=city_count['City'], palette='spring')
plt.xticks(rotation=45)
plt.show()
# Answer
# San Francisco sells the most products.

In [None]:
# What time should we display advertisements to maximize the likelihood of purchases?
# Count order rows per hour of day (0-23) taken from 'Order Date'.
order_times = pd.to_datetime(df['Order Date'])
times = order_times.groupby(order_times.dt.hour).count().to_frame()
times.index.name = 'Hour'
times.columns = ['Purchases']
sns.barplot(x=times.index, y=times['Purchases'], palette='spring')

# Answer:
# Data suggests most purchases occur at 12:00 and 19:00 hours. 

In [None]:
# What products are most often sold together?
# One-hot encode the product of every order line, then collapse to one row
# per order so each cell counts how often that product appears in the order.
order_lines = df[['Order ID', 'Product']]
dummified = pd.get_dummies(order_lines, columns=['Product'], prefix='', prefix_sep='').reset_index(drop=True)
per_order = dummified.groupby('Order ID').sum()
# Keep only the orders whose product counts add up to more than one
sold_together = per_order[per_order.sum(axis='columns') > 1]

# Heatmap of the pairwise correlation between product columns
fig, ax = plt.subplots(figsize=(6, 5))
sales_corr = sold_together.corr()
sns.heatmap(sales_corr, cmap='Greens', vmin=-0.25, vmax=0.4, ax=ax)
# Widen the y-limits by half a cell on each side.
# NOTE(review): looks like a workaround for heatmap rows being clipped in
# some matplotlib versions; confirm it is still needed.
y_low, y_high = ax.get_ylim()
ax.set_ylim(y_low + 0.5, y_high - 0.5)
plt.tight_layout()

# Answer
# (Google Phone + USB-C Charging Cables + Wireless Headphones) & (iPhone + Lightning Charging Cable + Apple Airpods)

In [None]:
# What products are sold the most using graphs with NetworkX
from draw_graph import create_corr_network

# Build the adjacency matrix from a copy of the correlation values with the
# diagonal set to 0 (no self-loops). Working on an explicit copy does not
# rely on DataFrame.values being a writable view, and leaves sales_corr intact.
sales_matrix = sales_corr.to_numpy(copy=True)
np.fill_diagonal(sales_matrix, 0)

# Get metadata of each node
products = sales_corr.columns
prices = df.groupby('Product')['Price Each'].mean()

# Instantiate graph and label nodes.
# from_numpy_array replaces from_numpy_matrix, which was removed in
# networkx 3.0; it takes a plain ndarray, so np.asmatrix is not needed.
G = nx.from_numpy_array(sales_matrix)
G = nx.relabel_nodes(G, lambda x: products[x])

# Plot Graph
create_corr_network(G)

In [None]:
# What product sold the most?
# Total quantity per product, sorted descending, keeping the first six rows.
most_sold = (
    df.groupby('Product')[['Quantity Ordered']]
    .sum()
    .sort_values('Quantity Ordered', ascending=False)
    .head(6)
)

sns.barplot(x=most_sold.index, y=most_sold['Quantity Ordered'], palette='spring')
plt.xticks(rotation=90)
plt.show()

In [None]:
def create_corr_network(G):
    """Draw ``G`` on a circular layout, save it to ``graph1.png`` and show it.

    Parameters
    ----------
    G : networkx graph
        Graph to draw. All nodes are drawn with their labels; the edges
        drawn are those that carry a ``'weight'`` attribute.

    Returns
    -------
    int
        Always 0.
    """
    # Edges that carry a 'weight' attribute. Taking the dict keys (rather than
    # unpacking zip(*items)) also works when there are no such edges.
    edges = list(nx.get_edge_attributes(G, 'weight'))

    # Positions in the visualization
    positions = nx.circular_layout(G)

    # Figure configuration
    plt.figure(figsize=(15, 15))

    # Draw nodes
    nx.draw_networkx_nodes(G, positions, node_color='#DA70D6', node_size=500, alpha=0.8)

    # Styling for labels
    nx.draw_networkx_labels(G, positions, font_size=8, font_family='sans-serif')

    # Draw the edges. The keyword is `edgelist`; `edge_list` is not a
    # parameter of draw_networkx_edges.
    nx.draw_networkx_edges(G, positions, edgelist=edges, style='solid')

    # Display the graph without axes
    plt.axis('off')

    # Save the image
    plt.savefig("graph1.png", format="PNG", dpi=320)

    # Show the image
    plt.show()

    return 0

In [None]:
# NOTE(review): this import rebinds create_corr_network to the version in the
# draw_graph module, replacing any same-named function defined earlier in the
# notebook; confirm which implementation is intended.
from draw_graph import create_corr_network

# Draw the graph G
create_corr_network(G)