In [55]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns
from scipy.stats import kendalltau
from matplotlib.ticker import FuncFormatter, MultipleLocator
import matplotlib.cm as cm
from matplotlib.widgets import Slider

In [56]:
# Load a random sample of each headerless CSV export and attach column names.
# SAMPLE_FRACTION keeps the notebook fast; RANDOM_SEED makes the sample (and
# every figure derived from it) reproducible across kernel restarts.
# NOTE(review): the four tables are sampled independently of each other, so the
# inner joins used by the plotting functions only keep rows whose keys survive
# in every sample involved -- consider sampling one table and filtering the
# others by key instead.
SAMPLE_FRACTION = 0.03
RANDOM_SEED = 42

global_users = pd.read_csv('../../fb2015/users.csv', header=None).sample(
    frac=SAMPLE_FRACTION, random_state=RANDOM_SEED)
global_users.columns = ['userId', 'registerCountry', 'signupTime']

global_conv = pd.read_csv('../../fb2015/conversions.csv', header=None).sample(
    frac=SAMPLE_FRACTION, random_state=RANDOM_SEED)
global_conv.columns = ['userId', 'itemId', 'price', 'quantity', 'timestamp']

global_items = pd.read_csv('../../fb2015/items.csv', header=None).sample(
    frac=SAMPLE_FRACTION, random_state=RANDOM_SEED)
global_items.columns = ['itemId', 'style', 'personality', 'color', 'theme', 'price', 'category']

global_users_ads = pd.read_csv('../../fb2015/users_ads.csv', header=None).sample(
    frac=SAMPLE_FRACTION, random_state=RANDOM_SEED)
global_users_ads.columns = ['userId', 'utmSource', 'utmCampaign', 'utmMedium', 'utmTerm', 'utmContent']

In [57]:
#def get_data(location):
#    data = pd.read_csv(location, header=None)
#    data = data.sample(frac=0.03)
#   
#    if location.endswith('users.csv'):
#        data.columns = ['userId', 'registerCountry', 'signupTime']
#    elif location.endswith('conversions.csv'):
#        data.columns = ['userId', 'itemId', 'price', 'quantity', 'timestamp']
#    elif location.endswith('items.csv'):
#        data.columns = ['itemId', 'style', 'personality', 'color', 'theme', 'price', 'category']
#    elif location.endswith('users_ads.csv'):
#        data.columns = ['userId', 'utmSource', 'utmCampaign', 'utmMedium', 'utmTerm', 'utmContent']
#        
#    return data


Obtains data from files.

In [58]:
def getYearMonth(s):
    """Return the 2nd and 3rd dash-separated fields of ``s`` joined by a dash.

    Anything following the first space inside the third field is discarded,
    e.g. ``"2015-03-14 12:00:00"`` yields ``"03-14"``.

    Raises IndexError when ``s`` has fewer than three dash-separated fields.
    """
    fields = s.split("-")
    middle = fields[1]
    # Cut off a trailing time-of-day (or anything else after a space).
    last = fields[2].split(" ")[0]
    return "-".join([middle, last])

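Utility function: takes a dash-separated date string and returns its second and third fields joined by a dash, dropping anything after the first space in the third field (e.g. `2015-03-14 12:00:00` gives `03-14`).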

In [59]:
 def number_of_new_users_per_month():
    """Show a bar chart of the number of sign-ups per (year, month).

    Reads ``global_users``; rows whose ``signupTime`` is missing or is the
    literal string 'None' are ignored. Returns nothing, the figure is shown.
    """
    # Discard rows without a usable sign-up timestamp.
    with_signup = global_users.dropna(subset=['signupTime'])
    signups = with_signup[~with_signup.signupTime.isin(['None'])]

    # Parse the timestamps once and derive both calendar parts from them.
    signup_index = pd.DatetimeIndex(signups['signupTime'])
    signups['month'] = signup_index.month
    signups['year'] = signup_index.year

    # Number of userId entries per (year, month), in chronological order.
    per_month = signups.groupby(['year', 'month'])['userId'].count().sort_index()

    ax = per_month.plot(kind='bar')
    ax.set_title("Number of new users per month.", fontsize=18)
    ax.set_ylabel('Number of new users')
    ax.set_xlabel('Month')

    plt.xticks(rotation=70)
    plt.tight_layout()
    plt.show()

Displays number of newly registered users per month.

In [60]:
def number_of_conversions_per_month():
    """Show a line chart of the number of conversions per (year, month).

    Reads ``global_conv``; rows whose ``timestamp`` is missing or is the
    literal string 'None' are ignored. Each remaining row counts as one
    conversion. Returns nothing, the figure is shown.
    """
    conv = global_conv.dropna(subset=['timestamp'])
    conv = conv[~conv.timestamp.isin(['None'])]

    # assign() builds a new frame, so the filtered slice is never written to
    # (avoids SettingWithCopyWarning) and the global table stays untouched.
    stamps = pd.DatetimeIndex(conv['timestamp'])
    conv = conv.assign(month=stamps.month, year=stamps.year)

    # size() yields ONE row count per month. The previous whole-frame count()
    # produced a separate non-null count for every column, which was then
    # drawn as several unlabeled lines.
    per_month = conv.groupby(['year', 'month']).size().sort_index()

    ax = per_month.plot()
    ax.set_xlabel('Month')
    ax.set_ylabel('Number of conversions')
    ax.set_title("Number of conversions per month.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays number of conversions per month.

In [61]:
def number_of_items_purchased_per_month():
    """Show a bar chart of the total purchased quantity per (year, month).

    Reads ``global_conv``; rows whose ``timestamp`` or ``quantity`` is missing
    or is the literal string 'None' are ignored. Returns nothing, the figure
    is shown.
    """
    conv = global_conv.dropna(subset=['timestamp', 'quantity'])
    conv = conv[~conv.timestamp.isin(['None'])]
    conv = conv[~conv.quantity.isin(['None'])]

    # assign() builds a new frame, so the filtered slice is never written to
    # (avoids SettingWithCopyWarning) and the global table stays untouched.
    stamps = pd.DatetimeIndex(conv['timestamp'])
    conv = conv.assign(month=stamps.month, year=stamps.year)

    # Sum the quantities: count() only counted conversion rows, which is the
    # number of purchases, not the number of items bought.
    per_month = conv.groupby(['year', 'month'])['quantity'].sum().sort_index()

    ax = per_month.plot(kind='bar')
    ax.set_xlabel('Month')
    ax.set_ylabel('Number of items')
    ax.set_title("Number of items purchased per month.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays number of items purchased per month.

In [62]:
def income_per_month():
    """Show a line chart of the summed ``price`` column per (year, month).

    Reads ``global_conv``; rows whose ``timestamp`` or ``price`` is missing or
    is the literal string 'None' are ignored. Returns nothing, the figure is
    shown.
    """
    # Discard rows lacking a usable timestamp or price.
    sales = global_conv.dropna(subset=['timestamp', 'price'])
    sales = sales[~sales.timestamp.isin(['None'])]
    sales = sales[~sales.price.isin(['None'])]

    # Parse the timestamps once and derive both calendar parts from them.
    sale_index = pd.DatetimeIndex(sales['timestamp'])
    sales['month'] = sale_index.month
    sales['year'] = sale_index.year

    # NOTE(review): only `price` is summed; whether it is a unit price that
    # should be multiplied by `quantity` cannot be told from here -- confirm.
    monthly_income = sales.groupby(['year', 'month'])['price'].sum().sort_index()

    ax = monthly_income.plot()
    ax.set_title("Income per month.", fontsize=18)
    ax.set_ylabel('Income')
    ax.set_xlabel('Month')

    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays income per month.

In [63]:
def number_of_purchases_per_user_in_the_first_week_after_signing_in():
    """Show a bar chart of the quantity each user bought within 7 days of sign-up.

    Reads ``global_users`` and ``global_conv`` without modifying them. Rows
    with a missing or literal 'None' ``userId``, ``signupTime``, ``timestamp``
    or ``quantity`` are ignored. Returns nothing, the figure is shown.
    """
    # Clean BEFORE parsing dates, so missing/'None' values never reach the
    # datetime parser (the other plotting functions follow the same order).
    users = global_users.dropna(subset=['userId', 'signupTime'])
    users = users[~users.userId.isin(['None'])]
    users = users[~users.signupTime.isin(['None'])]

    conv = global_conv.dropna(subset=['userId', 'timestamp', 'quantity'])
    conv = conv[~conv.userId.isin(['None'])]
    conv = conv[~conv.timestamp.isin(['None'])]
    conv = conv[~conv.quantity.isin(['None'])]

    # assign() returns new frames. Writing the columns through the aliases of
    # the global tables (as before) silently added 'week_after' to
    # global_users and re-typed global_conv['timestamp'] for every later cell.
    users = users.assign(
        week_after=pd.DatetimeIndex(users['signupTime']) + pd.Timedelta(days=7))
    conv = conv.assign(timestamp=pd.DatetimeIndex(conv['timestamp']))

    joined = pd.merge(users, conv, on='userId', how='inner')

    # NOTE(review): there is no lower bound, so purchases timestamped before
    # the sign-up are included as well -- confirm this is intended.
    in_first_week = joined[joined['timestamp'] <= joined['week_after']]
    grouped = in_first_week.groupby('userId')['quantity'].sum()

    ax = grouped.plot(kind='bar')
    ax.set_xlabel('User ID')  # the bars are users, not categories
    ax.set_ylabel('Quantity')
    ax.set_title("Number of items purchased in the first week after signing in, grouped by userId.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays number of items purchased during the first week after signing in.

In [64]:
def number_of_items_purchased_from_particular_category_grouped_by_country(category):
    """Show a bar chart of the quantity bought from one category, per country.

    Parameters
    ----------
    category
        Value compared (with ``==``) against the ``category`` column of
        ``global_items``; only matching purchases are plotted.

    Reads ``global_conv``, ``global_items`` and ``global_users``. Rows with a
    missing or literal 'None' ``category``, ``quantity`` or
    ``registerCountry`` are ignored. Returns nothing, the figure is shown.
    """
    # items -> conversions -> users; inner joins keep only purchases whose
    # item and buyer are both present in the loaded tables.
    joined_items_conv = pd.merge(global_items, global_conv, on='itemId', how='inner')
    joined = pd.merge(joined_items_conv, global_users, on='userId', how='inner')

    joined = joined.dropna(subset=['category', 'quantity', 'registerCountry'])
    joined = joined[~joined.registerCountry.isin(['None'])]
    joined = joined[~joined.category.isin(['None'])]
    joined = joined[~joined.quantity.isin(['None'])]

    filtered = joined[joined['category'] == category]
    grouped = filtered.groupby('registerCountry')['quantity'].sum()

    ax = grouped.plot(kind='bar')
    ax.set_xlabel('Country')  # bars are countries; this label used to say 'Category'
    ax.set_ylabel('Quantity')
    ax.set_title("Number of items from "+str(category)+" category, grouped by country.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()
    plt.show()

Displays number of items from particular category purchased by people from all countries.

In [65]:
def number_of_items_purchased_in_particular_country_grouped_by_category(country):
    """Show a bar chart of the quantity bought in one country, per category.

    Parameters
    ----------
    country
        Value compared (with ``==``) against the ``registerCountry`` column
        of ``global_users``; it is also concatenated into the plot title.

    Reads ``global_conv``, ``global_items`` and ``global_users``. Rows with a
    missing or literal 'None' ``category``, ``quantity`` or
    ``registerCountry`` are ignored. Returns nothing, the figure is shown.
    """
    # items -> conversions -> users, keeping only rows matched on both keys.
    purchases = pd.merge(
        pd.merge(global_items, global_conv, left_on='itemId', right_on='itemId', how='inner'),
        global_users,
        left_on='userId',
        right_on='userId',
        how='inner',
    )

    required = ['category', 'quantity', 'registerCountry']
    purchases = purchases.dropna(subset=required)
    purchases = purchases[~purchases.registerCountry.isin(['None'])]
    purchases = purchases[~purchases.category.isin(['None'])]
    purchases = purchases[~purchases.quantity.isin(['None'])]

    in_country = purchases[purchases['registerCountry'] == country]
    per_category = in_country.groupby('category')['quantity'].sum()

    ax = per_category.plot(kind='bar')
    ax.set_title("Number of items purchased in " + country + ", grouped by category.", fontsize=18)
    ax.set_ylabel('Quantity')
    ax.set_xlabel('Category')

    plt.xticks(rotation=70)
    plt.tight_layout()
    plt.show()

Displays the number of items from all categories purchased in a particular country.

In [66]:
def number_of_purchased_items_grouped_by_categories_in_all_countries():
    """Show a bubble chart of purchased quantity per (country, category).

    Categories run along the x axis and countries along the y axis; the area
    of each bubble is the summed ``quantity`` for that pair times 20. Every
    country row gets one randomly chosen colour from the rainbow colormap.

    Reads ``global_conv``, ``global_items`` and ``global_users``. Rows with a
    missing or literal 'None' ``category``, ``quantity`` or
    ``registerCountry`` are ignored. Returns nothing, the figure is shown.
    """
    joined_items_conv = pd.merge(global_items, global_conv, on='itemId', how='inner')
    joined = pd.merge(joined_items_conv, global_users, on='userId', how='inner')

    joined = joined.dropna(subset=['category', 'quantity', 'registerCountry'])
    joined = joined[~joined.registerCountry.isin(['None'])]
    joined = joined[~joined.category.isin(['None'])]
    joined = joined[~joined.quantity.isin(['None'])]

    # Axis order follows first appearance in the joined data.
    y_labels = list(joined['registerCountry'].unique())
    x_labels = list(joined['category'].unique())

    # Aggregate once instead of re-filtering the frame for every grid cell;
    # keys of the dict are (country, category) tuples.
    totals = joined.groupby(['registerCountry', 'category'])['quantity'].sum().to_dict()

    fig, ax = plt.subplots()
    x_positions = list(range(len(x_labels)))

    for j, country in enumerate(y_labels):
        # One scalar in [0, 1) -> one RGBA tuple. The former
        # np.linspace(r, r+1, 1) on a 1-element array only reproduced r, but
        # wrapped in extra dimensions that newer matplotlib rejects as a colour.
        color = cm.rainbow(np.random.rand())
        # Pairs that never occur get area 0, i.e. an invisible marker.
        areas = [totals.get((country, category), 0) * 20 for category in x_labels]
        ax.scatter(x_positions, [j] * len(x_labels), s=areas, color=color, alpha=0.7)

    ax.set(xticks=range(len(x_labels)), xticklabels=x_labels,
           yticks=range(len(y_labels)), yticklabels=y_labels)

    ax.set_xlabel('Category')
    ax.set_ylabel('Country')
    ax.set_title('Sum of purchased items, grouped by country and category.', fontsize=18)

    plt.tick_params(axis='x', labelsize=8)
    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays general view on sum of purchased items grouped by country and category.

In [67]:
def number_of_purchased_items_after_seeing_campaigns_grouped_by_categories():
    """Show a bubble chart of first-week purchases per (utmCampaign, category).

    Categories run along the x axis and campaigns along the y axis; the area
    of each bubble is the summed ``quantity`` for that pair times 20. Only
    purchases timestamped no later than 7 days after the buyer's
    ``signupTime`` are counted. Every campaign row gets one randomly chosen
    colour from the rainbow colormap.

    Reads ``global_conv``, ``global_users_ads``, ``global_users`` and
    ``global_items``. Rows with a missing or literal 'None' ``timestamp``,
    ``signupTime``, ``utmCampaign`` or ``category`` are ignored. Returns
    nothing, the figure is shown.
    """
    conv = global_conv.dropna(subset=['timestamp'])
    conv = conv[~conv.timestamp.isin(['None'])]
    users = global_users.dropna(subset=['signupTime'])
    users = users[~users.signupTime.isin(['None'])]
    users_ads = global_users_ads.dropna(subset=['utmCampaign'])
    users_ads = users_ads[~users_ads.utmCampaign.isin(['None'])]
    items = global_items.dropna(subset=['category'])
    items = items[~items.category.isin(['None'])]

    # assign() builds new frames, so the filtered slices are never written to
    # (avoids SettingWithCopyWarning).
    users = users.assign(
        week_after=pd.DatetimeIndex(users['signupTime']) + pd.Timedelta(days=7))
    conv = conv.assign(timestamp=pd.DatetimeIndex(conv['timestamp']))

    joined = pd.merge(users, users_ads, on='userId', how='inner')
    joined = pd.merge(joined, conv, on='userId', how='inner')
    joined = pd.merge(joined, items, on='itemId', how='inner')

    # NOTE(review): there is no lower bound, so purchases timestamped before
    # the sign-up are included as well -- confirm this is intended.
    filtered = joined[joined['timestamp'] <= joined['week_after']]

    # Axis order follows first appearance in the filtered data.
    y_labels = list(filtered['utmCampaign'].unique())
    x_labels = list(filtered['category'].unique())

    # Aggregate once instead of re-filtering the frame for every grid cell;
    # keys of the dict are (utmCampaign, category) tuples.
    totals = filtered.groupby(['utmCampaign', 'category'])['quantity'].sum().to_dict()

    fig, ax = plt.subplots()
    x_positions = list(range(len(x_labels)))

    for j, utmCampaign in enumerate(y_labels):
        # One scalar in [0, 1) -> one RGBA tuple. The former
        # np.linspace(r, r+1, 1) on a 1-element array only reproduced r, but
        # wrapped in extra dimensions that newer matplotlib rejects as a colour.
        color = cm.rainbow(np.random.rand())
        # Pairs that never occur get area 0, i.e. an invisible marker.
        areas = [totals.get((utmCampaign, category), 0) * 20 for category in x_labels]
        ax.scatter(x_positions, [j] * len(x_labels), s=areas, color=color, alpha=0.7)

    ax.set(xticks=range(len(x_labels)), xticklabels=x_labels,
           yticks=range(len(y_labels)), yticklabels=y_labels)

    ax.set_xlabel('Category')
    ax.set_ylabel('utmCampaign')
    ax.set_title('Sum of purchased items in the first week after signing in, grouped by campaigns and category.', fontsize=18)

    plt.tick_params(axis='x', labelsize=8)
    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays general view on sum of purchased items in the first week after signing in, connected with campaigns being seen, grouped by category and campaign.

In [74]:
def average_number_of_purchased_items_during_the_first_month_after_signing_in():
    """Show a bar chart of the average quantity bought per day since sign-up.

    For each of the first 30 days after a user's ``signupTime`` (day 0 is the
    first 24 hours), the quantities are summed per user and then averaged
    over the users who bought anything on that day. Days without purchases
    are drawn as 0.

    Reads ``global_conv`` and ``global_users``. Rows with a missing or
    literal 'None' ``timestamp``, ``quantity`` or ``signupTime`` are ignored.
    Returns nothing, the figure is shown.
    """
    days_tracked = 30

    conv = global_conv.dropna(subset=['timestamp', 'quantity'])
    conv = conv[~conv.timestamp.isin(['None'])]
    conv = conv[~conv.quantity.isin(['None'])]
    users = global_users.dropna(subset=['signupTime'])
    users = users[~users.signupTime.isin(['None'])]

    # assign() builds new frames, so the filtered slices are never written to
    # (avoids SettingWithCopyWarning).
    users = users.assign(signupTime=pd.DatetimeIndex(users['signupTime']))
    conv = conv.assign(timestamp=pd.DatetimeIndex(conv['timestamp']))
    joined = pd.merge(users, conv, on='userId', how='inner')

    # Subtracting two datetime columns yields a timedelta directly; routing
    # the datetimes through pd.to_timedelta is rejected by current pandas.
    elapsed = joined['timestamp'] - joined['signupTime']
    in_window = (elapsed >= pd.Timedelta(days=0)) & (elapsed < pd.Timedelta(days=days_tracked))

    # .dt.days gives whole elapsed days 0..29. The former
    # DatetimeIndex(timedelta).day read the day-of-month of an epoch date,
    # which shifted every bucket by one (1..30) and left bucket 0 empty.
    filtered = joined[in_window].assign(purchase_day=elapsed[in_window].dt.days)

    # Per-user daily totals, then the mean across users for each day;
    # replaces the manual accumulation over the removed Series.iteritems().
    per_user_day = filtered.groupby(['purchase_day', 'userId'])['quantity'].sum()
    average = (
        per_user_day.groupby(level='purchase_day').mean()
        .reindex(range(days_tracked), fill_value=0)
    )

    ax = sns.barplot(x=list(average.index), y=list(average.values))
    ax.set_xlabel('Day after signing in')
    ax.set_ylabel('Average quantity')
    ax.set_title('Average sum of purchases (quantities) during first month after signing in.', fontsize=18)

    plt.tick_params(axis='x', labelsize=8)
    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()



Displays average sum of purchases during first month after signing in.

In [None]:
#number_of_new_users_per_month()
#number_of_conversions_per_month()
#number_of_items_purchased_per_month()
#income_per_month()
#number_of_purchases_per_user_in_the_first_week_after_signing_in()
#number_of_items_purchased_from_particular_category_grouped_by_country('category_arg')
#number_of_items_purchased_in_particular_country_grouped_by_category('country_arg')
#number_of_purchased_items_grouped_by_categories_in_all_countries()
#number_of_purchased_items_after_seeing_campaigns_grouped_by_categories()
#average_number_of_purchased_items_during_the_first_month_after_signing_in()