In [92]:
import math
import statistics
import numpy
import csv
from scipy.stats import poisson
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns
from scipy.stats import kendalltau
from matplotlib.ticker import FuncFormatter, MultipleLocator
import matplotlib.cm as cm
from matplotlib.widgets import Slider
import matplotlib.ticker as ticker
import plotly.plotly as py
import cufflinks as cf
import plotly.graph_objs as go
import os

# Plotly credentials are taken from the environment so that the API key is not
# stored in the notebook. Export PLOTLY_API_KEY (and optionally PLOTLY_USERNAME)
# before starting the kernel; a missing key raises KeyError here instead of
# failing later when a chart is uploaded.
# NOTE(review): the key that used to be hard-coded on this line has been
# published with the notebook and should be regenerated.
py.sign_in(os.environ.get('PLOTLY_USERNAME', 'PythonAPI'), os.environ['PLOTLY_API_KEY'])

In [99]:
# Location of the raw, header-less CSV exports.
DATA_DIR = '../../fb2015'
# Share of rows kept from every table, to keep the notebook fast.
SAMPLE_FRACTION = 0.01
# Fixed seed: "Restart kernel -> Run all" reproduces the same sample and plots.
SAMPLE_SEED = 0


def _load_sample(file_name, column_names):
    """Read one header-less CSV from DATA_DIR and return a random sample of it.

    Parameters
    ----------
    file_name : str
        File name relative to DATA_DIR.
    column_names : list of str
        Names assigned to the columns, in file order.

    Returns
    -------
    pandas.DataFrame
        SAMPLE_FRACTION of the rows, drawn with SAMPLE_SEED, original index kept.
    """
    table = pd.read_csv(DATA_DIR + '/' + file_name, header=None)
    table = table.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
    table.columns = column_names
    return table


# NOTE(review): every table is sampled independently, so joins between them
# only keep rows whose keys happen to be sampled on both sides.
global_users = _load_sample('users.csv', ['userId', 'registerCountry', 'signupTime'])
global_conv = _load_sample('conversions.csv', ['userId', 'itemId', 'price', 'quantity', 'timestamp'])
global_items = _load_sample('items.csv', ['itemId', 'style', 'personality', 'color', 'theme', 'price', 'category'])
global_users_ads = _load_sample('users_ads.csv', ['userId', 'utmSource', 'utmCampaign', 'utmMedium', 'utmTerm', 'utmContent'])

Obtains data from files.

In [29]:
def drop_nulls(table, columns):
    """Return a copy of `table` without rows that lack a value in `columns`.

    A row is removed when, in any of the listed columns, it holds NaN/None or
    the literal string 'None'.

    Parameters
    ----------
    table : pandas.DataFrame
        Source frame; it is never modified.
    columns : list of str
        Column names that must be populated. An empty list keeps every row.

    Returns
    -------
    pandas.DataFrame
        An independent copy (original index labels preserved), so callers can
        add columns to it without touching `table` and without triggering
        SettingWithCopyWarning.
    """
    columns = list(columns)
    if not columns:
        return table.copy()

    populated = table.dropna(subset=columns)
    # One mask over all requested columns instead of re-filtering per column.
    has_placeholder = populated[columns].isin(['None']).any(axis=1)
    return populated.loc[~has_placeholder].copy()

Drops every row that has NaN/None, or the literal string 'None', in any of the given columns.

In [None]:
def number_of_new_users_per_month():
    """Draw a bar chart with the number of sign-ups for every (year, month)."""
    registered = drop_nulls(global_users, ['signupTime', 'userId'])

    # Parse the sign-up timestamps once and reuse them for both calendar parts.
    signup_stamps = pd.DatetimeIndex(registered['signupTime'])
    registered['month'] = signup_stamps.month
    registered['year'] = signup_stamps.year

    monthly_signups = registered.groupby(['year', 'month'])['userId'].count().sort_index()

    axes = monthly_signups.plot(kind='bar')
    axes.set_title("Number of new users per month.", fontsize=18)
    axes.set_ylabel('Number of new users')
    axes.set_xlabel('Month')

    plt.xticks(rotation=70)
    plt.tight_layout()
    plt.show()

Displays number of newly registered users per month.

In [None]:
def number_of_conversions_per_month():
    """Draw a line chart with the number of conversions for every (year, month).

    A conversion is one row of the conversions table that has a timestamp.
    """
    conv = drop_nulls(global_conv, ['timestamp'])

    stamps = pd.DatetimeIndex(conv['timestamp'])
    conv['month'] = stamps.month
    conv['year'] = stamps.year

    # size() counts rows per month and yields a single series. Calling count()
    # on the whole frame returned one non-null count per column, which drew
    # several overlapping lines instead of one.
    monthly = conv.groupby(['year', 'month']).size().sort_index()

    ax = monthly.plot(legend=False)
    ax.set_xlabel('Month')
    ax.set_ylabel('Number of conversions')
    ax.set_title("Number of conversions per month.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays number of conversions per month.

In [None]:
def number_of_items_purchased_per_month():
    """Draw a bar chart with the summed purchase quantity for every (year, month)."""
    purchases = drop_nulls(global_conv, ['timestamp', 'quantity'])

    # Parse the purchase timestamps once and reuse them for both calendar parts.
    purchase_stamps = pd.DatetimeIndex(purchases['timestamp'])
    purchases['month'] = purchase_stamps.month
    purchases['year'] = purchase_stamps.year

    monthly_quantity = purchases.groupby(['year', 'month'])['quantity'].sum().sort_index()

    axes = monthly_quantity.plot(kind='bar')
    axes.set_title("Number of items purchased per month.", fontsize=18)
    axes.set_ylabel('Number of items')
    axes.set_xlabel('Month')

    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays number of items purchased per month.

In [None]:
def revenue_per_month():
    """Draw a line chart with the summed `price` column for every (year, month)."""
    sales = drop_nulls(global_conv, ['timestamp', 'price'])

    # Parse the sale timestamps once and reuse them for both calendar parts.
    sale_stamps = pd.DatetimeIndex(sales['timestamp'])
    sales['month'] = sale_stamps.month
    sales['year'] = sale_stamps.year

    monthly_revenue = sales.groupby(['year', 'month'])['price'].sum().sort_index()

    axes = monthly_revenue.plot()
    axes.set_title("Revenue per month.", fontsize=18)
    axes.set_ylabel('Revenue')
    axes.set_xlabel('Month')

    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays income per month.

In [None]:
def number_of_purchases_per_user_in_the_first_week_after_registration():
    """Draw a bar chart of items bought per user within 7 days of sign-up.

    Users without any purchase inside that window are not shown.
    """
    users = drop_nulls(global_users, ['userId', 'signupTime'])
    conv = drop_nulls(global_conv, ['userId', 'quantity', 'timestamp'])

    users['signupTime'] = pd.DatetimeIndex(users['signupTime'])
    users['week_after'] = users['signupTime'] + pd.Timedelta(days=7)
    conv['timestamp'] = pd.DatetimeIndex(conv['timestamp'])

    joined = pd.merge(users, conv, on='userId', how='inner')

    # The window needs both ends. With only the upper bound, purchases stamped
    # before the sign-up time were counted as "first week" purchases; the
    # 30-day analyses in this notebook already discard negative offsets.
    in_first_week = (
        (joined['timestamp'] >= joined['signupTime'])
        & (joined['timestamp'] <= joined['week_after'])
    )
    grouped = joined[in_first_week].groupby('userId')['quantity'].sum()

    ax = grouped.plot(kind='bar')
    ax.set_xlabel('User id')
    ax.set_ylabel('Quantity')
    ax.set_title("Number of items purchased in the first week after registration, grouped by userId.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

Displays number of items purchased during the first week after signing in.

In [None]:
def number_of_items_purchased_from_particular_category_grouped_by_country(category=2346301904):
    """Draw a bar chart of the quantity bought from one category, per country.

    Parameters
    ----------
    category : optional
        Category id to report on, compared with the `category` column of the
        items table. Defaults to 2346301904, the value that used to be
        hard-coded inside the function and silently replaced the argument.
    """
    conv = drop_nulls(global_conv, ['quantity'])
    items = drop_nulls(global_items, ['category'])
    users = drop_nulls(global_users, ['registerCountry'])

    joined_items_conv = pd.merge(items, conv, on='itemId', how='inner')
    joined = pd.merge(joined_items_conv, users, on='userId', how='inner')

    filtered = joined[joined['category'] == category]
    grouped = filtered.groupby('registerCountry')['quantity'].sum()

    ax = grouped.plot(kind='bar')
    # The bars are countries; the label previously said 'Category'.
    ax.set_xlabel('Country')
    ax.set_ylabel('Quantity')
    ax.set_title("Number of items from "+str(category)+" category, grouped by country.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()
    plt.show()

Displays number of items from particular category purchased by people from all countries.

In [None]:
def number_of_items_purchased_in_particular_country_grouped_by_category(country='United States'):
    """Draw a bar chart of the quantity bought in one country, per category.

    Parameters
    ----------
    country : str, optional
        Value of `registerCountry` to report on. Defaults to 'United States',
        the value that used to be hard-coded inside the function and silently
        replaced the argument.
    """
    conv = drop_nulls(global_conv, ['quantity'])
    items = drop_nulls(global_items, ['category'])
    users = drop_nulls(global_users, ['registerCountry'])

    joined_items_conv = pd.merge(items, conv, on='itemId', how='inner')
    joined = pd.merge(joined_items_conv, users, on='userId', how='inner')

    filtered = joined[joined['registerCountry'] == country]
    grouped = filtered.groupby('category')['quantity'].sum()

    ax = grouped.plot(kind='bar')
    ax.set_xlabel('Category')
    ax.set_ylabel('Quantity')
    # str() keeps the title working if a non-string country value is passed.
    ax.set_title("Number of items purchased in " + str(country) + ", grouped by category.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()
    plt.show()

Displays the number of items from all categories purchased in a particular country.

In [15]:
def number_of_purchased_items_grouped_by_categories_in_all_countries():
    """Draw a bubble chart of purchased quantity per (country, category).

    Categories are on the x axis and countries on the y axis, both in order of
    first appearance in the joined data. The bubble area is the summed quantity
    and every country row gets its own random colour from the rainbow colormap.
    """
    conv = drop_nulls(global_conv, ['quantity'])
    items = drop_nulls(global_items, ['category'])
    users = drop_nulls(global_users, ['registerCountry'])

    joined_items_conv = pd.merge(items, conv, on='itemId', how='inner')
    joined = pd.merge(joined_items_conv, users, on='userId', how='inner')

    y_labels = list(joined['registerCountry'].unique())
    x_labels = list(joined['category'].unique())
    x_positions = np.arange(len(x_labels))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    for j, country in enumerate(y_labels):
        # One scalar draw per country; `color=` takes a single RGBA tuple
        # without the ambiguity of passing a nested array through `c=`.
        color = cm.rainbow(np.random.rand())
        # Filter the country once and aggregate all of its categories together
        # instead of re-filtering the full frame for every (country, category).
        per_category = (
            joined[joined['registerCountry'] == country]
            .groupby('category')['quantity'].sum()
            .reindex(x_labels, fill_value=0)
        )
        ax.scatter(x=x_positions, y=np.full(len(x_labels), j),
                   s=per_category.values, color=color, alpha=0.7)

    ax.set(xticks=range(len(x_labels)), xticklabels=x_labels,
           yticks=range(len(y_labels)), yticklabels=y_labels)

    ax.set_xlabel('Category')
    ax.set_ylabel('Country')
    ax.set_title('Sum of purchased items, grouped by country and category.', fontsize=18)

    plt.tick_params(axis='x')
    plt.xticks(rotation=90)
    plt.tight_layout()

    plt.show()

Displays general view on sum of purchased items grouped by country and category.

In [4]:
def number_of_purchased_items_after_seeing_campaigns_grouped_by_categories():
    """Draw a bubble chart of first-week purchases per (campaign, category).

    Only purchases made within 7 days of the user's sign-up are included.
    Categories are on the x axis and `utmCampaign` values on the y axis, both
    in order of first appearance. The bubble area is the summed quantity and
    every campaign row gets its own random colour from the rainbow colormap.
    """
    conv = drop_nulls(global_conv, ['timestamp'])
    users_ads = drop_nulls(global_users_ads, ['utmCampaign'])
    users = drop_nulls(global_users, ['signupTime'])
    items = drop_nulls(global_items, ['category'])

    users['signupTime'] = pd.DatetimeIndex(users['signupTime'])
    users['week_after'] = users['signupTime'] + pd.Timedelta(days=7)
    conv['timestamp'] = pd.DatetimeIndex(conv['timestamp'])

    joined = pd.merge(users, users_ads, on='userId', how='inner')
    joined = pd.merge(joined, conv, on='userId', how='inner')
    joined = pd.merge(joined, items, on='itemId', how='inner')

    # Both ends of the window are checked. With only the upper bound, purchases
    # stamped before the sign-up time were counted as first-week purchases.
    in_first_week = (
        (joined['timestamp'] >= joined['signupTime'])
        & (joined['timestamp'] <= joined['week_after'])
    )
    filtered = joined[in_first_week]

    y_labels = list(filtered['utmCampaign'].unique())
    x_labels = list(filtered['category'].unique())
    x_positions = np.arange(len(x_labels))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    for j, utmCampaign in enumerate(y_labels):
        # One scalar draw per campaign; `color=` takes a single RGBA tuple
        # without the ambiguity of passing a nested array through `c=`.
        color = cm.rainbow(np.random.rand())
        # Filter the campaign once and aggregate all of its categories together
        # instead of re-filtering for every (campaign, category) pair.
        per_category = (
            filtered[filtered['utmCampaign'] == utmCampaign]
            .groupby('category')['quantity'].sum()
            .reindex(x_labels, fill_value=0)
        )
        ax.scatter(x=x_positions, y=np.full(len(x_labels), j),
                   s=per_category.values, color=color, alpha=0.7)

    ax.set(xticks=range(len(x_labels)), xticklabels=x_labels,
           yticks=range(len(y_labels)), yticklabels=y_labels)

    ax.set_xlabel('Category')
    ax.set_ylabel('utmCampaign')
    ax.set_title('Sum of purchased items in the first week after signing in, grouped by campaigns and category.', fontsize=18)

    plt.tick_params(axis='x', labelsize=8)
    plt.xticks(rotation=70)
    plt.tight_layout()

    plt.show()

  mask |= (ar1 == a)
  if self._edgecolors == str('face'):


Displays general view on sum of purchased items in the first week after signing in, connected with campaigns being seen, grouped by category and campaign.

In [2]:
def average_number_of_purchased_items_during_the_first_month_after_signing_in():
    """Draw a bar chart of the average quantity bought per day after sign-up.

    For every day of the first 30 days after sign-up, the quantities are summed
    per user and then averaged over the users who bought something on that day.
    Days are numbered from 1 (the first 24 hours after sign-up) to 30; slot 0
    stays empty, which keeps the same axis as the earlier version of this chart.
    """
    conv = drop_nulls(global_conv, ['timestamp', 'quantity'])
    users = drop_nulls(global_users, ['signupTime', 'userId'])

    users['signupTime'] = pd.DatetimeIndex(users['signupTime'])
    conv['timestamp'] = pd.DatetimeIndex(conv['timestamp'])

    joined = pd.merge(users, conv, on='userId', how='inner')

    # Subtract the datetimes directly. The previous round trip through
    # to_timedelta()/DatetimeIndex().day depended on reinterpreting datetimes
    # as offsets from the epoch and is rejected by current pandas.
    elapsed = joined['timestamp'] - joined['signupTime']
    in_first_month = (elapsed >= pd.Timedelta(days=0)) & (elapsed < pd.Timedelta(days=30))

    filtered = joined[in_first_month].copy()
    # +1 reproduces the 1-based day number the day-of-month trick produced.
    filtered['purchase_day'] = elapsed[in_first_month].dt.days + 1

    per_user = filtered.groupby(['purchase_day', 'userId'])['quantity'].sum()
    daily_mean = per_user.groupby(level='purchase_day').mean()

    days = list(range(0, 31))
    # Days without any purchase are reported as 0.
    average = daily_mean.reindex(days, fill_value=0).tolist()

    ax = sns.barplot(y=average, x=days)
    ax.set_xlabel('Day after signing in')
    ax.set_ylabel('Average quantity')
    ax.set_title('Average sum of purchases (quantities) during first month after signing in.', fontsize=18)

    plt.tick_params(axis='x', labelsize=8)
    plt.tight_layout()

    plt.show()

Displays average sum of purchases during first month after signing in.

In [6]:
def distribution_number_of_purchases_per_user_in_the_first_week_after_registration():
    """Upload a Plotly bar chart: how many users bought N items in week one.

    The x axis is the total quantity a user bought within 7 days of sign-up and
    the y axis is the number of users with that total. Users that appear in the
    joined data but have no purchase inside the window are counted under 0.
    """
    users = drop_nulls(global_users, ['signupTime'])
    conv = drop_nulls(global_conv, ['timestamp', 'quantity'])

    users['signupTime'] = pd.DatetimeIndex(users['signupTime'])
    users['week_after'] = users['signupTime'] + pd.Timedelta(days=7)
    conv['timestamp'] = pd.DatetimeIndex(conv['timestamp'])

    joined = pd.merge(users, conv, on='userId', how='inner')

    # Both ends of the window are checked. With only the upper bound, purchases
    # stamped before the sign-up time were counted as first-week purchases.
    in_first_week = (
        (joined['timestamp'] >= joined['signupTime'])
        & (joined['timestamp'] <= joined['week_after'])
    )
    filtered = joined[in_first_week]
    grouped = filtered.groupby('userId')['quantity'].sum()

    # value_counts() replaces the hand-written dictionary counter.
    d = grouped.value_counts().sort_index().to_dict()

    outside_window = joined.loc[~joined['userId'].isin(filtered['userId']), 'userId'].nunique()
    # Add to the zero bucket rather than overwrite it, so users whose in-window
    # quantities sum to 0 are not lost.
    d[0] = d.get(0, 0) + outside_window

    axis_title_font = dict(
        family='Courier New, monospace',
        size=18,
        color='#7f7f7f'
    )
    layout = go.Layout(
        title='Distribution of sum of purchases per user during first week after registration',
        xaxis=dict(
            title='Number of purchased products',
            titlefont=dict(axis_title_font)
        ),
        yaxis=dict(
            title='Number of users',
            titlefont=dict(axis_title_font)
        )
    )
    data = [go.Bar(x=list(d.keys()), y=list(d.values()))]
    fig = go.Figure(data=data, layout=layout)
    py.plot(fig, filename='pandas-bar-chart')

'https://plot.ly/~PythonAPI/1438'

Shows distribution of how many items users bought during first week after sign up.

In [17]:
def probability_of_purchase_during_the_first_month_after_registration():
    """Draw a bar chart of purchases per day after sign-up, relative to buyers.

    For every day of the first 30 days after sign-up, the number of purchase
    rows on that day is divided by the number of distinct users that have at
    least one purchase in the joined data. Days are numbered from 1 (the first
    24 hours after sign-up) to 30; slot 0 stays empty, which keeps the same
    axis as the earlier version of this chart.
    """
    conv = drop_nulls(global_conv, ['timestamp', 'quantity'])
    users = drop_nulls(global_users, ['signupTime'])

    users['signupTime'] = pd.DatetimeIndex(users['signupTime'])
    conv['timestamp'] = pd.DatetimeIndex(conv['timestamp'])

    joined = pd.merge(users, conv, on='userId', how='inner')
    nr_of_users = joined['userId'].nunique()

    # Subtract the datetimes directly. The previous round trip through
    # to_timedelta()/DatetimeIndex().day depended on reinterpreting datetimes
    # as offsets from the epoch and is rejected by current pandas.
    elapsed = joined['timestamp'] - joined['signupTime']
    in_first_month = (elapsed >= pd.Timedelta(days=0)) & (elapsed < pd.Timedelta(days=30))

    filtered = joined[in_first_month].copy()
    # +1 reproduces the 1-based day number the day-of-month trick produced.
    filtered['purchase_day'] = elapsed[in_first_month].dt.days + 1

    purch = filtered.groupby('purchase_day')['userId'].count()

    days = list(range(0, 31))
    probabil = [0] * len(days)
    if nr_of_users:
        # Guarded so an empty join gives an all-zero chart, not 0/0 = NaN.
        probabil = (purch.reindex(days, fill_value=0) / nr_of_users).tolist()

    ax = sns.barplot(y=probabil, x=days)
    ax.set_xlabel('Day after registration')
    ax.set_ylabel('Probability (sum of purchases/number of users who have at least one purchase)')
    ax.set_title('Probability of purchase during the first month after registration', fontsize=18)

    plt.tick_params(axis='x', labelsize=8)
    plt.tight_layout()

    plt.show()


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison


axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.



Shows how likely a user is to make a purchase on each day of the first month after sign-up.

In [30]:
#def poisson_distribution_revenue_per_month():
# Exploratory scratch cell: the function header above is commented out, so the
# statements below run at top level and leave `conv`, `grouped`, `aver` and
# `values` in the notebook namespace. It prints random draws parameterised by
# the monthly revenue; nothing is plotted (the plotting code is commented out).
# NOTE(review): the output recorded for this cell is a KeyboardInterrupt, so it
# apparently did not run to completion — confirm before relying on it.
conv = drop_nulls(global_conv, ['timestamp', 'price'])

# Sum the `price` column per (year, month) and take the mean of those sums.
conv['month'] = pd.DatetimeIndex(conv['timestamp']).month
conv['year'] = pd.DatetimeIndex(conv['timestamp']).year
grouped = conv.groupby(['year', 'month'])['price'].sum()
aver = grouped.mean()

print(aver)

# One Poisson draw whose rate is 12 times the mean monthly revenue.
print(numpy.random.poisson(lam=aver*12))
print("++++++++")
#print(poisson.pmf(aver*24, aver*12))
# Normal draws centred on aver*12; only the standard deviation, sqrt(aver*i),
# changes with i.
# NOTE(review): the mean does not depend on i — confirm this is intended.
for i in range(1, 24):
    print("i:"+str(i))
    print(np.random.normal( aver*12,  math.sqrt(aver*i)))
print("++++++++")
# For every monthly revenue total, print it and one Poisson draw using that
# total as the rate.
for lambd in grouped:
    # `values` is only used by the commented-out pmf computation below.
    values = []
    print(lambd)
    #for k in range(1, len(grouped)+1):
    #values.append((math.pow(lambd, k)*math.exp(-lambd))/math.factorial(k))
    #print("  "+str(k))
    print(numpy.random.poisson(lam=lambd))#, size=len(grouped)))
#sortedd = grouped.sort_index()

#ax = sortedd.plot() 
#ax.set_xlabel('Month')
#ax.set_ylabel('Income')
#ax.set_title("Income per month.", fontsize=18)

#plt.xticks(rotation=70)
#plt.tight_layout()

#plt.show()

KeyboardInterrupt: 

In [40]:
def number_of_active_users_per_month():
    """Draw a bar chart with the number of active users for every (year, month).

    A user is active in a month when they have at least one conversion with a
    timestamp in that month; each user is counted once per month.
    """
    users = global_users
    conv = drop_nulls(global_conv, ['timestamp'])

    stamps = pd.DatetimeIndex(conv['timestamp'])
    conv['month'] = stamps.month
    conv['year'] = stamps.year

    joined = pd.merge(users, conv, on='userId', how='inner')
    # nunique() counts distinct users. count() returned the number of purchase
    # rows, so a user with several purchases in a month was counted repeatedly.
    grouped = joined.groupby(['year', 'month'])['userId'].nunique()
    sortedd = grouped.sort_index()

    ax = sortedd.plot(kind='bar')

    ax.set_xlabel('Month')
    ax.set_ylabel('Number of active users')
    ax.set_title("Number of active users per month.", fontsize=18)

    plt.xticks(rotation=70)
    plt.tight_layout()
    plt.show()

Number of users who have at least one purchase in particular month.

In [None]:
#def variance_sum_of_revenue_for_items_purchased_during_first_month_after_signup():
# Per calendar month: variance of the total price spent by each user. Users
# registered by that month who bought nothing in it are included as 0.
#
# Changes from the earlier version of this cell: the undefined name
# `conv_grouped` is gone; MultiIndex `.labels` (removed from pandas) and
# positional `series[i]` lookups are replaced by iteration over the index; the
# last month is no longer merged into the month before it; months with fewer
# than two data points are skipped instead of raising; debug prints removed.
BASE_YEAR = 2013     # first year shown on the chart
N_YEARS = 3          # the chart covers BASE_YEAR .. BASE_YEAR + N_YEARS - 1
N_MONTH_SLOTS = 13   # slot 0 is unused so that slots 1..12 equal month numbers

users = drop_nulls(global_users, ['userId'])
conv = drop_nulls(global_conv, ['timestamp'])

conv_time = pd.DatetimeIndex(conv['timestamp'])
conv['conv_month'] = conv_time.month
conv['conv_year'] = conv_time.year

signup_time = pd.DatetimeIndex(users['signupTime'])
users['signup_month'] = signup_time.month
users['signup_year'] = signup_time.year

joined = pd.merge(users, conv, on='userId', how='inner')
# Total price spent per user within every conversion month.
grouped = joined.groupby(['conv_year', 'conv_month', 'userId'])['price'].sum()
grouped = grouped.sort_index()

# Running total of sign-ups in chronological order.
signup_grouped = users.groupby(['signup_year', 'signup_month'])['userId'].count()
signup_grouped = signup_grouped.sort_index().cumsum()

# signed[year - BASE_YEAR][month] = users registered up to and including it.
# NOTE(review): months without any sign-up keep 0 here and are therefore
# skipped below, as before — confirm whether the running total should be
# carried forward into such months.
signed = [[0] * N_MONTH_SLOTS for _ in range(N_YEARS)]
for (year, month), registered in zip(signup_grouped.index, signup_grouped.values):
    # int(): the group keys are floats when some sign-up times are missing.
    row = int(year) - BASE_YEAR
    if 0 <= row < N_YEARS:
        signed[row][int(month)] = int(registered)

# res[year - BASE_YEAR][month] = variance of per-user spending in that month.
res = [[0] * N_MONTH_SLOTS for _ in range(N_YEARS)]
for (year, month), spent in grouped.groupby(level=[0, 1]):
    row, month = int(year) - BASE_YEAR, int(month)
    if not 0 <= row < N_YEARS:
        continue
    registered = signed[row][month]
    if registered <= 0:
        continue
    data = spent.tolist()
    # Registered users without a purchase this month contribute a 0.
    data.extend([0] * (registered - len(data)))
    # statistics.variance needs at least two data points.
    if len(data) >= 2:
        res[row][month] = statistics.variance(data)

variance = []
labels = []
for i in range(0, N_YEARS):
    for j in range(0, N_MONTH_SLOTS):
        variance.append(res[i][j])
        labels.append(str(j)+" / "+str(i+BASE_YEAR))

ax = sns.barplot(y=variance, x=labels)
ax.set_xlabel('Year and month')
ax.set_ylabel('Variance')
ax.set_title('Variance: sum of prices of purchases for each user in each month after sign up', fontsize=18)

plt.tick_params(axis='x', labelsize=10)
plt.xticks(rotation=70)
plt.tight_layout()

plt.show()