In [301]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns
from scipy.stats import kendalltau
from matplotlib.ticker import FuncFormatter, MultipleLocator
import matplotlib.cm as cm
from matplotlib.widgets import Slider

In [302]:
def get_data(location, frac=0.03, random_state=None):
    """Load a headerless CSV file, sample its rows and name its columns.

    Parameters
    ----------
    location : str or path-like
        Path of the CSV file. The end of the path selects the column names:
        ``users.csv``, ``conversions.csv``, ``items.csv`` or ``users_ads.csv``.
    frac : float or None, default 0.03
        Fraction of rows to keep, drawn at random without replacement.
        ``None`` skips sampling and returns every row in file order.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``; pass a fixed value to get the
        same sample on every run.

    Returns
    -------
    pandas.DataFrame
        The (sampled) rows. When the path matches none of the known file
        names, the frame keeps pandas' default integer column labels.

    Raises
    ------
    ValueError
        Raised by pandas when a recognised file does not have the expected
        number of columns.
    """
    data = pd.read_csv(location, header=None)
    if frac is not None:
        data = data.sample(frac=frac, random_state=random_state)

    # File-name suffix -> column names for that file.
    column_names_by_suffix = {
        'users.csv': ['userId', 'registerCountry', 'signupTime'],
        'conversions.csv': ['userId', 'itemId', 'price', 'quantity', 'timestamp'],
        'items.csv': ['itemId', 'style', 'personality', 'color', 'theme', 'price', 'category'],
        'users_ads.csv': ['userId', 'utmSource', 'utmCampaign'],
    }
    path_text = str(location)
    for suffix, column_names in column_names_by_suffix.items():
        if path_text.endswith(suffix):
            data.columns = column_names
            break
    return data


Loads a headerless CSV file, keeps a random 3% sample of its rows, and names the columns according to the file name.

In [303]:
def getYearMonth(s):
    """Return the 2nd and 3rd hyphen-separated fields of ``s`` joined by "-".

    Anything after the first space in the third field (for example a
    time-of-day part) is dropped. Raises ``IndexError`` when ``s`` contains
    fewer than two hyphens.

    NOTE(review): which date parts these fields hold depends on the input
    format, which is not visible here -- "DD-MM-YYYY hh:mm" gives "MM-YYYY",
    whereas "YYYY-MM-DD hh:mm" gives "MM-DD". Confirm against the data.
    """
    fields = s.split("-")
    second_field = fields[1]
    third_field_head = fields[2].split(" ")[0]
    return "-".join((second_field, third_field_head))

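Utility function: returns the second and third hyphen-separated fields of a date string with any time-of-day part removed (for a `DD-MM-YYYY hh:mm:ss` value this is `MM-YYYY`).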

In [304]:
 def number_of_new_users_per_month():
    """Plot a bar chart of how many sampled users signed up in each month.

    Reads the users file through ``get_data`` (which returns a random sample
    of the rows), discards rows whose ``signupTime`` is missing or the literal
    string 'None', and counts ``userId`` values per (year, month).
    Shows the figure; returns nothing.
    """
    users = get_data('../../fb2015/users.csv')

    # Keep only rows with a usable signup time.
    users = users.dropna(subset=['signupTime'])
    users = users[~users.signupTime.isin(['None'])]

    # Parse the dates once. assign() builds a new frame, so nothing is
    # written into the filtered slice (no SettingWithCopyWarning).
    signup_times = pd.DatetimeIndex(users['signupTime'])
    users = users.assign(month=signup_times.month, year=signup_times.year)

    users_per_month = users.groupby(['year', 'month'])['userId'].count().sort_index()

    ax = users_per_month.plot(kind='bar')

    ax.set_xlabel('Month')
    ax.set_ylabel('Number of new users')
    ax.set_title("Number of new users per month.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()
    plt.show()

Displays number of newly registered users per month.

In [305]:
def number_of_conversions_per_month():
    """Plot a line chart of how many sampled conversions happened each month.

    Reads the conversions file through ``get_data`` (which returns a random
    sample of the rows), discards rows whose ``timestamp`` is missing or the
    literal string 'None', and counts the remaining rows per (year, month).
    Shows the figure; returns nothing.
    """
    conv = get_data('../../fb2015/conversions.csv')

    # Keep only rows with a usable timestamp.
    conv = conv.dropna(subset=['timestamp'])
    conv = conv[~conv.timestamp.isin(['None'])]

    # Parse the dates once. assign() builds a new frame, so nothing is
    # written into the filtered slice (no SettingWithCopyWarning).
    timestamps = pd.DatetimeIndex(conv['timestamp'])
    conv = conv.assign(month=timestamps.month, year=timestamps.year)

    # size() gives one row count per month. count() on the whole frame would
    # return a separate count for every column and plot one line per column.
    conversions_per_month = conv.groupby(['year', 'month']).size().sort_index()

    ax = conversions_per_month.plot()
    ax.set_xlabel('Month')
    ax.set_ylabel('Number of conversions')
    ax.set_title("Number of conversions per month.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays number of conversions per month.

In [306]:
def number_of_items_purchased_per_month():
    """Plot a bar chart of the total quantity of items purchased each month.

    Reads the conversions file through ``get_data`` (which returns a random
    sample of the rows), discards rows whose ``timestamp`` or ``quantity`` is
    missing or the literal string 'None', and sums ``quantity`` per
    (year, month). Shows the figure; returns nothing.
    """
    conv = get_data('../../fb2015/conversions.csv')

    # Keep only rows with a usable timestamp and quantity.
    conv = conv.dropna(subset=['timestamp', 'quantity'])
    conv = conv[~conv.timestamp.isin(['None'])]
    conv = conv[~conv.quantity.isin(['None'])]

    timestamps = pd.DatetimeIndex(conv['timestamp'])
    conv = conv.assign(
        month=timestamps.month,
        year=timestamps.year,
        # The column is read as text when the file contains 'None' markers;
        # make it numeric so that sum() adds instead of concatenating.
        quantity=pd.to_numeric(conv['quantity']),
    )

    # Sum the quantities: counting rows would give the number of
    # conversions, not the number of items.
    items_per_month = conv.groupby(['year', 'month'])['quantity'].sum().sort_index()

    ax = items_per_month.plot(kind='bar')
    ax.set_xlabel('Month')
    ax.set_ylabel('Number of items')
    ax.set_title("Number of items purchased per month.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays number of items purchased per month.

In [307]:
def income_per_month():
    """Plot a line chart of the summed ``price`` of sampled conversions per month.

    Reads the conversions file through ``get_data`` (which returns a random
    sample of the rows), discards rows whose ``timestamp`` or ``price`` is
    missing or the literal string 'None', and sums ``price`` per
    (year, month). Shows the figure; returns nothing.

    NOTE(review): ``price`` is summed as-is and is not multiplied by
    ``quantity`` -- confirm that the column already holds the order total.
    """
    conv = get_data('../../fb2015/conversions.csv')

    # Keep only rows with a usable timestamp and price.
    conv = conv.dropna(subset=['timestamp', 'price'])
    conv = conv[~conv.timestamp.isin(['None'])]
    conv = conv[~conv.price.isin(['None'])]

    timestamps = pd.DatetimeIndex(conv['timestamp'])
    conv = conv.assign(
        month=timestamps.month,
        year=timestamps.year,
        # The column is read as text when the file contains 'None' markers;
        # make it numeric so that sum() adds instead of concatenating.
        price=pd.to_numeric(conv['price']),
    )

    income = conv.groupby(['year', 'month'])['price'].sum().sort_index()

    ax = income.plot()
    ax.set_xlabel('Month')
    ax.set_ylabel('Income')
    ax.set_title("Income per month.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays income per month.

In [308]:
def number_of_purchases_per_user_in_the_first_week_after_signing_in():
    """Plot, per user, the quantity of items bought within 7 days of signup.

    Users and conversions are read through ``get_data`` (which returns a
    random sample of each file), cleaned of missing / 'None' values, joined on
    ``userId`` and restricted to conversions whose ``timestamp`` lies between
    the user's ``signupTime`` and ``signupTime`` + 7 days (both inclusive).
    Shows a bar chart with one bar per user; returns nothing.

    NOTE(review): both files are sampled independently before the join, so
    only a small share of the matching user/conversion pairs survives.
    """
    users = get_data('../../fb2015/users.csv')
    # Clean before parsing: a literal 'None' cannot be parsed as a date.
    users = users.dropna(subset=['userId', 'signupTime'])
    users = users[~users.userId.isin(['None'])]
    users = users[~users.signupTime.isin(['None'])]
    signup_times = pd.DatetimeIndex(users['signupTime'])
    users = users.assign(
        signupTime=signup_times,
        week_after=signup_times + pd.Timedelta(days=7),
    )

    conv = get_data('../../fb2015/conversions.csv')
    conv = conv.dropna(subset=['userId', 'timestamp', 'quantity'])
    conv = conv[~conv.userId.isin(['None'])]
    conv = conv[~conv.timestamp.isin(['None'])]
    conv = conv[~conv.quantity.isin(['None'])]
    conv = conv.assign(
        timestamp=pd.DatetimeIndex(conv['timestamp']),
        # Text when the file contains 'None' markers; sum() needs numbers.
        quantity=pd.to_numeric(conv['quantity']),
    )

    joined = pd.merge(users, conv, on='userId', how='inner')

    # The first week starts at signup, so exclude anything recorded earlier.
    in_first_week = (
        (joined['timestamp'] >= joined['signupTime'])
        & (joined['timestamp'] <= joined['week_after'])
    )
    grouped = joined[in_first_week].groupby('userId')['quantity'].sum()

    ax = grouped.plot(kind='bar')
    ax.set_xlabel('User ID')
    ax.set_ylabel('Quantity')
    ax.set_title("Number of items purchased in the first week after signing in, grouped by userId.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays the number of items each user purchased during the first week after signing up.

In [309]:
def number_of_items_purchased_from_particular_category_grouped_by_country(category):
    """Plot the quantity of items of one category bought per register country.

    Parameters
    ----------
    category
        Value of the items' ``category`` column to keep.

    Items, conversions and users are read through ``get_data`` (which returns
    a random sample of each file), joined on ``itemId`` and ``userId``,
    cleaned of missing / 'None' values, filtered to ``category`` and summed
    per ``registerCountry``. Shows a bar chart; returns nothing.

    NOTE(review): the three files are sampled independently before the joins,
    so only a small share of the matching rows survives.
    """
    conv = get_data('../../fb2015/conversions.csv')
    items = get_data('../../fb2015/items.csv')
    users = get_data('../../fb2015/users.csv')

    joined = pd.merge(items, conv, on='itemId', how='inner')
    joined = pd.merge(joined, users, on='userId', how='inner')

    joined = joined.dropna(subset=['category', 'quantity', 'registerCountry'])
    joined = joined[~joined.registerCountry.isin(['None'])]
    joined = joined[~joined.category.isin(['None'])]
    joined = joined[~joined.quantity.isin(['None'])]

    # The column is read as text when the file contains 'None' markers;
    # make it numeric so that sum() adds instead of concatenating.
    joined = joined.assign(quantity=pd.to_numeric(joined['quantity']))

    in_category = joined[joined['category'] == category]
    quantity_per_country = in_category.groupby('registerCountry')['quantity'].sum()

    ax = quantity_per_country.plot(kind='bar')
    # The bars are countries, not categories.
    ax.set_xlabel('Country')
    ax.set_ylabel('Quantity')
    ax.set_title("Number of items from "+str(category)+" category, grouped by country.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()
    plt.show()

Displays the number of items from a particular category purchased by users registered in each country.

In [310]:
def number_of_items_purchased_in_particular_country_grouped_by_category(country):
    """Plot the quantity of items bought per category by users of one country.

    Parameters
    ----------
    country
        Value of the users' ``registerCountry`` column to keep.

    Items, conversions and users are read through ``get_data`` (which returns
    a random sample of each file), joined on ``itemId`` and ``userId``,
    cleaned of missing / 'None' values, filtered to ``country`` and summed
    per ``category``. Shows a bar chart; returns nothing.

    NOTE(review): the three files are sampled independently before the joins,
    so only a small share of the matching rows survives.
    """
    conv = get_data('../../fb2015/conversions.csv')
    items = get_data('../../fb2015/items.csv')
    users = get_data('../../fb2015/users.csv')

    joined = pd.merge(items, conv, on='itemId', how='inner')
    joined = pd.merge(joined, users, on='userId', how='inner')

    joined = joined.dropna(subset=['category', 'quantity', 'registerCountry'])
    joined = joined[~joined.registerCountry.isin(['None'])]
    joined = joined[~joined.category.isin(['None'])]
    joined = joined[~joined.quantity.isin(['None'])]

    # The column is read as text when the file contains 'None' markers;
    # make it numeric so that sum() adds instead of concatenating.
    joined = joined.assign(quantity=pd.to_numeric(joined['quantity']))

    in_country = joined[joined['registerCountry'] == country]
    quantity_per_category = in_country.groupby('category')['quantity'].sum()

    ax = quantity_per_category.plot(kind='bar')
    ax.set_xlabel('Category')
    ax.set_ylabel('Quantity')
    # str() keeps the title working for non-string country codes.
    ax.set_title("Number of items purchased in " + str(country) + ", grouped by category.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()
    plt.show()

Displays the number of items from all categories purchased in a particular country.

In [311]:
def number_of_purchased_items_grouped_by_categories_in_all_countries():
    """Plot a bubble chart of purchased quantity per (country, category) pair.

    Items, conversions and users are read through ``get_data`` (which returns
    a random sample of each file), joined on ``itemId`` and ``userId`` and
    cleaned of missing / 'None' values. Categories are placed on the x axis
    and register countries on the y axis; each bubble's area is the summed
    ``quantity`` for that pair times 20, and every country row gets its own
    colour. Shows the figure; returns nothing.

    NOTE(review): the three files are sampled independently before the joins,
    so only a small share of the matching rows survives.
    """
    conv = get_data('../../fb2015/conversions.csv')
    items = get_data('../../fb2015/items.csv')
    users = get_data('../../fb2015/users.csv')

    joined = pd.merge(items, conv, on='itemId', how='inner')
    joined = pd.merge(joined, users, on='userId', how='inner')

    joined = joined.dropna(subset=['category', 'quantity', 'registerCountry'])
    joined = joined[~joined.registerCountry.isin(['None'])]
    joined = joined[~joined.category.isin(['None'])]
    joined = joined[~joined.quantity.isin(['None'])]

    # The column is read as text when the file contains 'None' markers;
    # make it numeric so that sum() adds instead of concatenating.
    joined = joined.assign(quantity=pd.to_numeric(joined['quantity']))

    # Axis order follows first appearance in the joined data.
    countries = list(joined['registerCountry'].unique())
    categories = list(joined['category'].unique())

    # One grouped sum replaces re-filtering the frame for every pair.
    totals = joined.groupby(['registerCountry', 'category'])['quantity'].sum()

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # Evenly spaced colours: distinct per country and the same on every run.
    row_colors = cm.rainbow(np.linspace(0, 1, len(countries)))
    x_positions = list(range(len(categories)))

    for j, country in enumerate(countries):
        # Pairs without purchases get area 0 (an invisible marker).
        areas = [totals.get((country, category), 0) * 20 for category in categories]
        ax.scatter(x=x_positions, y=[j] * len(categories), s=areas,
                   color=row_colors[j], alpha=0.7)

    ax.set(xticks=range(len(categories)), xticklabels=categories,
           yticks=range(len(countries)), yticklabels=countries)

    ax.set_xlabel('Category')
    ax.set_ylabel('Country')
    ax.set_title('Sum of purchased items, grouped by country and category.', fontsize=18)

    plt.tick_params(axis='x', labelsize=8)
    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays general view on sum of purchased items grouped by country and category.

In [312]:
# Entry points for the charts defined in the cells above. Every call is
# disabled; uncomment one to render its figure. 'category_arg' and
# 'country_arg' are placeholders to replace with real values from the data.
#number_of_new_users_per_month()
#number_of_conversions_per_month()
#number_of_items_purchased_per_month()
#income_per_month()
#number_of_purchases_per_user_in_the_first_week_after_signing_in()
#number_of_items_purchased_from_particular_category_grouped_by_country('category_arg')
#number_of_items_purchased_in_particular_country_grouped_by_category('country_arg')
#number_of_purchased_items_grouped_by_categories_in_all_countries()