In [632]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns
from scipy.stats import kendalltau
from matplotlib.ticker import FuncFormatter, MultipleLocator
import matplotlib.cm as cm

In [633]:
def get_data(location, frac=0.05, random_state=None):
    """Load a headerless CSV file, sample its rows and name its columns.

    Parameters
    ----------
    location : str
        Path of the CSV file. The file is read with ``header=None``.
    frac : float, optional
        Fraction of rows to keep, passed to ``DataFrame.sample``. Defaults to
        0.05. Pass ``1.0`` to keep every row (in shuffled order).
    random_state : optional
        Seed forwarded to ``DataFrame.sample`` so that the sample can be
        reproduced. Defaults to ``None`` (unseeded).

    Returns
    -------
    pandas.DataFrame
        The sampled rows. Columns are renamed when ``location`` ends with one
        of the known file names; for any other file the default integer
        column labels are kept.

    Notes
    -----
    Callers in this notebook load several files separately and then
    inner-join them. With independent fractional samples only rows whose keys
    happen to survive in every sample remain after the join, so joined
    results shrink much faster than ``frac``.
    """
    data = pd.read_csv(location, header=None)
    data = data.sample(frac=frac, random_state=random_state)

    # File-name suffix -> column names. No suffix here is a suffix of another
    # ('users_ads.csv' does not end with 'users.csv'), so at most one matches.
    columns_by_suffix = (
        ('users.csv', ['userId', 'registerCountry', 'signupTime']),
        ('conversions.csv', ['userId', 'itemId', 'price', 'quantity', 'timestamp']),
        ('items.csv', ['itemId', 'style', 'personality', 'color', 'theme', 'price', 'category']),
        ('users_ads.csv', ['userId', 'utmSource', 'utmCampaign']),
    )
    for suffix, column_names in columns_by_suffix:
        if location.endswith(suffix):
            data.columns = column_names
            break
    return data


Loads a CSV file, keeps a random sample of its rows (5% by default) and names the columns according to the file name.

In [634]:
def getYearMonth(s):
    """Return the second and third '-'-separated fields of ``s`` joined by '-'.

    Anything after the first space in the third field is discarded, so
    "2015-03-17 12:00:00" yields "03-17" and "17-03-2015 12:00" yields
    "03-2015".

    NOTE(review): the name suggests a year-month key, which is only what comes
    out for day-month-year input -- confirm the expected date format.

    Raises IndexError when ``s`` contains fewer than two '-' characters.
    """
    fields = s.split("-")
    second = fields[1]
    third = fields[2].split(" ")[0]
    return "-".join((second, third))

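Utility function: builds a short date key from a date string by joining its second and third `-`-separated fields (any time-of-day part is dropped).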

In [635]:
 def number_of_new_users_per_month():
    """Draw a bar chart of the number of users per signup (year, month).

    Reads the users file through ``get_data``, derives ``month`` and ``year``
    columns from ``signupTime`` and counts ``userId`` values in each group.
    """
    users = get_data('../../fb2015/users.csv')

    # Parse the signup timestamps once and reuse them for both columns.
    signup_times = pd.DatetimeIndex(users['signupTime'])
    users['month'] = signup_times.month
    users['year'] = signup_times.year

    signups = (
        users.groupby(['year', 'month'])['userId']
        .count()
        .sort_index()
    )
    signups.plot(kind='bar')
    plt.show()

Displays number of newly registered users per month.

In [636]:
def number_of_conversions_per_month():
    """Plot the number of conversion rows per (year, month).

    Reads the conversions file through ``get_data``, derives ``month`` and
    ``year`` columns from ``timestamp`` and draws one line with the number of
    rows in each group.
    """
    conv = get_data('../../fb2015/conversions.csv')

    timestamps = pd.DatetimeIndex(conv['timestamp'])
    conv['month'] = timestamps.month
    conv['year'] = timestamps.year

    # size() gives a single row count per group. Calling count() on the whole
    # grouped frame returns one count per remaining column, which made the
    # plot draw a separate (redundant) line for every column.
    conv_per_yearmonth = conv.groupby(['year', 'month']).size().sort_index()

    conv_per_yearmonth.plot(title='Number of conversions per month')
    plt.show()

Displays number of conversions per month.

In [637]:
def number_of_items_purchased_per_month():
    """Plot the total quantity of items purchased per (year, month).

    Reads the conversions file through ``get_data``, derives ``month`` and
    ``year`` columns from ``timestamp`` and sums ``quantity`` in each group.
    """
    conv = get_data('../../fb2015/conversions.csv')

    timestamps = pd.DatetimeIndex(conv['timestamp'])
    conv['month'] = timestamps.month
    conv['year'] = timestamps.year

    # Sum the quantities: count() would only count conversion rows, so a row
    # with quantity 3 would contribute 1 instead of 3 purchased items.
    items_per_yearmonth = (
        conv.groupby(['year', 'month'])['quantity']
        .sum()
        .sort_index()
    )

    items_per_yearmonth.plot(title='Number of items purchased per month')
    plt.show()

Displays number of items purchased per month.

In [638]:
def income_per_month():
    """Plot the sum of the ``price`` column per (year, month).

    Reads the conversions file through ``get_data`` and derives ``month`` and
    ``year`` columns from ``timestamp`` before grouping.

    NOTE(review): ``quantity`` is not taken into account here -- confirm that
    ``price`` already holds the total for the conversion, not a unit price.
    """
    conv = get_data('../../fb2015/conversions.csv')

    when = pd.DatetimeIndex(conv['timestamp'])
    conv['month'] = when.month
    conv['year'] = when.year

    monthly_income = (
        conv.groupby(['year', 'month'])['price']
        .sum()
        .sort_index()
    )
    monthly_income.plot()
    plt.show()

Displays income per month.

In [639]:
def number_of_purchases_per_user_in_the_first_week():
    """Draw a bar chart of the quantity each user bought within 7 days of signup.

    Users and conversions are loaded through ``get_data`` and inner-joined on
    ``userId``. Rows whose ``timestamp`` is not later than ``signupTime`` plus
    seven days are kept, and ``quantity`` is summed per user.

    NOTE(review): only the upper bound is checked, so a conversion dated
    before the signup would also be included -- confirm that cannot happen.
    """
    users = get_data('../../fb2015/users.csv')
    one_week = pd.Timedelta(days=7)
    users['week_after'] = pd.DatetimeIndex(users['signupTime']) + one_week

    conv = get_data('../../fb2015/conversions.csv')
    conv['timestamp'] = pd.DatetimeIndex(conv['timestamp'])

    joined = users.merge(conv, on='userId', how='inner')
    within_first_week = joined['timestamp'] <= joined['week_after']
    quantity_per_user = (
        joined.loc[within_first_week]
        .groupby('userId')['quantity']
        .sum()
    )

    quantity_per_user.plot(kind='bar')
    plt.show()

Displays the number of items each user purchased during the first week after signing up.

In [640]:
def number_of_items_from_particular_category_purchased_in_all_countries():
    """Draw a bar chart of the summed purchased quantity per item category.

    Conversions and items are loaded through ``get_data`` and inner-joined on
    ``itemId``; no country filter is applied.
    """
    conv = get_data('../../fb2015/conversions.csv')
    items = get_data('../../fb2015/items.csv')

    quantity_by_category = (
        items.merge(conv, on='itemId', how='inner')
        .groupby('category')['quantity']
        .sum()
    )

    quantity_by_category.plot(
        kind='bar',
        title='Number of items purchased in all countries, grouped by category.',
    )
    plt.show()

Displays the number of items purchased in each category, across all countries.

In [641]:
def number_of_items_purchased_in_particular_country_grouped_by_category(country):
    """Draw a bar chart of the summed purchased quantity per category for one country.

    Parameters
    ----------
    country : str
        Value compared against the users' ``registerCountry`` column; it is
        also concatenated into the chart title.

    Conversions, items and users are loaded through ``get_data``; items are
    inner-joined with conversions on ``itemId`` and the result with users on
    ``userId`` before filtering.
    """
    conv = get_data('../../fb2015/conversions.csv')
    items = get_data('../../fb2015/items.csv')
    users = get_data('../../fb2015/users.csv')

    purchases = (
        items.merge(conv, on='itemId', how='inner')
        .merge(users, on='userId', how='inner')
    )
    from_country = purchases['registerCountry'] == country
    quantity_by_category = (
        purchases.loc[from_country]
        .groupby('category')['quantity']
        .sum()
    )

    chart_title = "Number of items purchased in " + country + ", grouped by category."
    quantity_by_category.plot(kind='bar', title=chart_title)
    plt.show()

Displays the number of items purchased in a particular country, grouped by category.

In [642]:
def number_of_purchased_items_grouped_by_categories_in_all_countries():
    """Draw a bubble chart of purchased quantity per (country, category).

    Conversions, items and users are loaded through ``get_data``; items are
    inner-joined with conversions on ``itemId`` and the result with users on
    ``userId``. Rows lacking a category, quantity or register country are
    dropped. Each country gets a row and each category a column; the bubble
    area is the summed ``quantity`` for that pair multiplied by 20.

    Countries or categories labelled '' or 'None' keep their tick but get no
    bubbles. When the joined data is empty a message is printed and nothing
    is plotted.
    """
    conv = get_data('../../fb2015/conversions.csv')
    items = get_data('../../fb2015/items.csv')
    users = get_data('../../fb2015/users.csv')

    joined_items_conv = pd.merge(items, conv, on='itemId', how='inner')
    joined = pd.merge(joined_items_conv, users, on='userId', how='inner')
    joined = joined.dropna(subset=['category', 'quantity', 'registerCountry'])

    if joined.empty:
        # Without any category the figure width below would be zero, which
        # matplotlib rejects; bail out with an explicit message instead.
        print('No purchases to plot.')
        return

    countries = list(joined['registerCountry'].unique())
    categories = list(joined['category'].unique())

    # Aggregate once instead of re-filtering the frame for every
    # (country, category) pair. Pairs without purchases get 0, and the
    # reindex restores the order of first appearance used for the ticks.
    totals = (
        joined.groupby(['registerCountry', 'category'])['quantity']
        .sum()
        .unstack(fill_value=0)
        .reindex(index=countries, columns=categories, fill_value=0)
    )

    placeholders = ('', 'None')
    x_positions = [i for i, category in enumerate(categories)
                   if category not in placeholders]

    # Colormaps take floats in [0, 1] and clip anything outside, so feeding
    # the raw row number gave every country after the first the same colour.
    # Spread the countries evenly over the colormap's range instead.
    colors = cm.rainbow(np.linspace(0, 1, len(countries)))

    # Keep a minimum width so that a handful of categories stays readable.
    fig = plt.figure(figsize=(max(len(categories) * 0.6, 4),
                              max(len(countries) + 2, 7)))
    ax = fig.add_subplot(1, 1, 1)

    for j, country in enumerate(countries):
        if country in placeholders:
            continue
        areas = totals.loc[country].values[x_positions] * 20
        ax.scatter(x=x_positions, y=[j] * len(x_positions), s=areas,
                   color=colors[j], alpha=0.5)

    ax.set(xticks=range(len(categories)), xticklabels=categories,
           yticks=range(len(countries)), yticklabels=countries,
           title='Number of items purchased, grouped by country and category.')
    ax.tick_params(axis='x', labelsize=8)
    plt.xticks(rotation=70)

    plt.show()

Displays an overview of the total number of purchased items, grouped by country and category.

In [643]:
# Render the country-by-category bubble chart defined above.
number_of_purchased_items_grouped_by_categories_in_all_countries()