In [9]:
import csv
import pandas
import numpy as np
import matplotlib.pyplot as plt
import time
import datetime
import matplotlib
import seaborn as sns
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn import decomposition

# Each CSV ships without a header row, so it is read with header=None and the
# column names are attached right after loading. Timestamp columns are parsed
# into datetimes immediately so later cells can compare them directly.

# users table
users = pandas.read_csv("data/users.csv", header=None)
users.columns = ['userId', 'registerCountry', 'signupTime']
users['signupTime'] = pandas.to_datetime(users['signupTime'])

# conversions table
conversions = pandas.read_csv("data/conversions.csv", header=None)
conversions.columns = ['userId', 'itemId', 'price', 'quantity', 'timestamp']
conversions['timestamp'] = pandas.to_datetime(conversions['timestamp'])

# items table
items = pandas.read_csv("data/items.csv", header=None)
items.columns = ['itemId', 'style', 'personality', 'color', 'theme', 'price', 'category']

# users_ads table
users_ads = pandas.read_csv("data/users_ads.csv", header=None)
users_ads.columns = ['userId', 'utmSource', 'utmCampaign', 'utmMedium', 'utmTerm', 'utmContent']

# views table (low_memory=False makes pandas read the file in a single pass)
views = pandas.read_csv("data/views.csv", header=None, low_memory=False)
views.columns = ['userId', 'itemId', 'timestamp', 'pagetype']
views['timestamp'] = pandas.to_datetime(views['timestamp'])

In [10]:
import random
def random_subset(df, percent):
    """Return a random sample of the rows of ``df`` without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    percent : float
        Fraction of rows to keep, between 0 and 1 inclusive (despite the
        name, 0.2 means 20%). The number of rows kept is
        ``int(percent * len(df))``, i.e. rounded down.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in the order they were drawn.

    Raises
    ------
    ValueError
        If ``percent`` is outside ``[0, 1]``.

    Notes
    -----
    Draws from the standard-library ``random`` module, so call
    ``random.seed(...)`` beforehand for reproducible subsets.
    """
    if not 0 <= percent <= 1:
        raise ValueError("percent must be a fraction between 0 and 1, got %r" % (percent,))
    new_size = int(percent * len(df))
    # Sample row *positions* rather than index labels: this works with
    # duplicate labels, always yields exactly new_size rows, and avoids both
    # the removed DataFrame.ix accessor and random.sample() on a set (which
    # newer Python versions reject).
    positions = random.sample(range(len(df)), new_size)
    return df.iloc[positions]

# random_subset draws from the stdlib `random` module; seed it here so that
# re-running this cell (or the whole notebook) produces the same samples.
random.seed(42)

# Fraction of each table kept in the *_small working copies.
SUBSET_FRACTION = 0.2

users_small = random_subset(users, SUBSET_FRACTION)
items_small = random_subset(items, SUBSET_FRACTION)
users_ads_small = random_subset(users_ads, SUBSET_FRACTION)
conversions_small = random_subset(conversions, SUBSET_FRACTION)
# views_small = random_subset(views, SUBSET_FRACTION)

In [11]:
# Attach user attributes and then item attributes to every conversion row.
# Both joins are inner joins, so conversions whose userId or itemId has no
# match are dropped. 'price' exists in both conversions and items, which is
# why the result carries price_x (conversions) and price_y (items).
full_info_conversions = (
    conversions
    .merge(users, how='inner', on='userId')
    .merge(items, how='inner', on='itemId')
)
print(full_info_conversions.columns.values)

['userId' 'itemId' 'price_x' 'quantity' 'timestamp' 'registerCountry'
 'signupTime' 'style' 'personality' 'color' 'theme' 'price_y' 'category']


In [12]:
# Inner-join the enriched conversions with the views on userId only, so each
# conversion row is paired with every view row of the same user. Columns that
# exist on both sides get the _x (conversion) / _y (view) suffixes.
full_info_views_conversions = pandas.merge(
    full_info_conversions,
    views,
    how='inner',
    on='userId',
)
print(full_info_views_conversions.columns.values)

['userId' 'itemId_x' 'price_x' 'quantity' 'timestamp_x' 'registerCountry'
 'signupTime' 'style' 'personality' 'color' 'theme' 'price_y' 'category'
 'itemId_y' 'timestamp_y' 'pagetype']


Plotting charts for users who had at most k views and bought something

In [107]:
def mean_or_zero(series):
    """Return the mean of ``series``, or 0.0 when it has no elements."""
    return series.mean() if series.size else 0.0

def _per_user_totals(purchase_view_rows):
    """Collapse purchase-by-view rows into one row of totals per user.

    ``purchase_view_rows`` pairs every purchase of a user with every view of
    that user, so counting or summing its rows directly multiplies views by
    purchases and inflates quantity/spending by the number of views. Each side
    is therefore de-duplicated on its own columns before aggregating.

    NOTE(review): rows that agree on all purchase columns (or on all view
    columns) are treated as the same event; confirm the raw tables contain no
    legitimate exact duplicates.

    Returns a DataFrame indexed by userId with the columns ``quantity``,
    ``spending`` and ``number_views``.
    """
    purchase_key = ['userId', 'itemId_x', 'timestamp_x', 'price_x', 'quantity']
    view_key = ['userId', 'itemId_y', 'timestamp_y', 'pagetype']

    purchases = purchase_view_rows.drop_duplicates(subset=purchase_key)
    totals = purchases.groupby('userId')[['quantity', 'spending']].sum()

    view_counts = (
        purchase_view_rows
        .drop_duplicates(subset=view_key)
        .groupby('userId')
        .size()
    )
    # Both aggregates come from the same rows, so they cover the same users.
    totals['number_views'] = view_counts.astype(int)
    return totals


def charts_max_k_views_bought_something(period_of_time_days, k):
    """Plot mean spending and mean purchased quantity binned by view count.

    Uses the module-level ``full_info_views_conversions`` frame. Only rows
    whose purchase timestamp (``timestamp_x``) lies within
    ``period_of_time_days`` days after the user's ``signupTime`` are kept.
    Users are then binned by their number of views (1..k) and the per-bin
    mean of total spending (``price_y * quantity``) and of total quantity is
    drawn as a bar chart; bins without users are shown as 0.

    NOTE(review): the time window is applied to the purchase timestamp only;
    views (``timestamp_y``) are not restricted to the window although the
    figure title speaks of "initial views" - confirm the intended filter.

    Parameters
    ----------
    period_of_time_days : int or float
        Length of the window after signup, in days.
    k : int
        Largest number of views to include (bins are 1..k).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))

    frame = full_info_views_conversions
    # Timedelta(<int>) without a unit means nanoseconds; spell out days.
    cutoff = frame.signupTime + pandas.Timedelta(days=period_of_time_days)
    # .assign builds a new frame, so the shared global frame is not modified
    # and no SettingWithCopyWarning is triggered.
    df = frame.loc[frame.timestamp_x <= cutoff]
    df = df.assign(spending=df.price_y * df.quantity)

    totals = _per_user_totals(df)

    # Every user in the frame has at least one view, so bins start at 1 and
    # include k itself ("max k views").
    bins = list(range(1, k + 1))
    binned_means = (
        totals[totals.number_views <= k]
        .groupby('number_views')[['spending', 'quantity']]
        .mean()
        .reindex(bins, fill_value=0.0)
    )

    # (column, panel title, title font size, y-axis label) for each panel.
    panels = [
        ('spending', "Average spending", 16, "Mean spending"),
        ('quantity', "Mean quantity of bought objects", 13, "Mean quantity"),
    ]

    sns.set()
    f, axes = plt.subplots(len(panels), sharex=False, sharey=False)
    main_title = "Charts binned by number of initial views (max " + str(k) + " views) during first " + str(period_of_time_days) + " days after registration"
    f.suptitle(main_title)
    # 10 inches of height per panel, matching the original 15x30 three-row layout.
    f.set_size_inches(15, 10 * len(panels))
    f.tight_layout(pad=1, w_pad=1, h_pad=13)
    plt.subplots_adjust(top=0.91)

    for ax, (column, title, fontsize, ylabel) in zip(axes, panels):
        binned_means[column].plot(ax=ax, kind='bar')
        ax.set_title(title, fontsize=fontsize)
        ax.set_xlabel("Number of views")
        ax.set_ylabel(ylabel)

    plt.show()

In [None]:
# First 10 days after signup, users with at most 20 views.
charts_max_k_views_bought_something(period_of_time_days=10, k=20)