In [352]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns
from scipy.stats import kendalltau

In [353]:
def get_data(location):
    data = pd.read_csv(location, header=None)
    data = data.sample(frac=0.05)
   
    if location.endswith('users.csv'):
        data.columns = ['userId', 'registerCountry', 'signupTime']
    elif location.endswith('conversions.csv'):
        data.columns = ['userId', 'itemId', 'price', 'quantity', 'timestamp']
    elif location.endswith('items.csv'):
        data.columns = ['itemId', 'style', 'personality', 'color', 'theme', 'price', 'category']
    elif location.endswith('users_ads.csv'):
        data.columns = ['userId', 'utmSource', 'utmCampaign']
    return data


Obtains data from files.

In [354]:
def getYearMonth(s):
    return s.split("-")[1]+"-"+s.split("-")[2].split(" ")[0]

Util function: extracts months from dates.

In [355]:
 def number_of_new_users_per_month():
    users = get_data('../../fb2015/users.csv')
    users['month'] = pd.DatetimeIndex(users['signupTime']).month
    users['year'] = pd.DatetimeIndex(users['signupTime']).year
    users_per_yearmonth = users.groupby(['year', 'month'])['userId'].count()
    users_per_yearmonth = users_per_yearmonth.sort_index()
    users_per_yearmonth.plot(kind='bar')
    plt.show()

Displays number of newly registered users per month.

In [356]:
def number_of_conversions_per_month():
    conv = get_data('../../fb2015/conversions.csv')
    conv['month'] = pd.DatetimeIndex(conv['timestamp']).month
    conv['year'] = pd.DatetimeIndex(conv['timestamp']).year
    conv_per_yearmonth = conv.groupby(['year', 'month']).count()
    conv_per_yearmonth = conv_per_yearmonth.sort_index()
    conv_per_yearmonth.plot()
    plt.show()

Displays number of conversions per month.

In [357]:
def number_of_items_purchased_per_month():
    conv = get_data('../../fb2015/conversions.csv')
    conv['month'] = pd.DatetimeIndex(conv['timestamp']).month
    conv['year'] = pd.DatetimeIndex(conv['timestamp']).year
    conv_per_yearmonth = conv.groupby(['year', 'month'])['quantity'].count()
    conv_per_yearmonth = conv_per_yearmonth.sort_index()
    conv_per_yearmonth.plot()
    plt.show()

Displays number of items purchased per month.

In [358]:
def income_per_month():
    conv = get_data('../../fb2015/conversions.csv')
    conv['month'] = pd.DatetimeIndex(conv['timestamp']).month
    conv['year'] = pd.DatetimeIndex(conv['timestamp']).year
    conv_per_yearmonth = conv.groupby(['year', 'month'])['price'].sum()
    conv_per_yearmonth = conv_per_yearmonth.sort_index()
    conv_per_yearmonth.plot()
    plt.show()

Displays income per month.

In [359]:
def number_of_purchases_per_user_in_the_first_week():
    users = get_data('../../fb2015/users.csv')
    users['week_after'] = pd.DatetimeIndex(users['signupTime'])+pd.Timedelta(days=7)
    
    conv = get_data('../../fb2015/conversions.csv')
    conv['timestamp'] = pd.DatetimeIndex(conv['timestamp'])
    
    joined = pd.merge(users, conv, left_on='userId', right_on='userId', how='inner')
    conversions_per_user_in_the_first_week = joined[joined['timestamp'] <= joined['week_after']].groupby('userId')['quantity'].sum()
    
    conversions_per_user_in_the_first_week.plot(kind='bar')
    plt.show()

Displays number of items purchased during the first week after signing in.

In [360]:
def number_of_items_from_particular_category_purchased_in_all_countries():
    conv = get_data('../../fb2015/conversions.csv')
    items = get_data('../../fb2015/items.csv')
    
    joined = pd.merge(items, conv, left_on='itemId', right_on='itemId', how='inner')
    count_of_items = joined.groupby('category')['quantity'].sum()
    #count_of_items.columns = ['category', 'sum']
    
    count_of_items.plot(kind='bar', title='Number of items purchased in all countries, grouped by category.')
    plt.show()

Displays number of items from particular category purchased by people from all countries.

In [361]:
def number_of_items_purchased_in_particular_country_grouped_by_category(country):
    conv = get_data('../../fb2015/conversions.csv')
    items = get_data('../../fb2015/items.csv')
    users = get_data('../../fb2015/users.csv')
    
    joined_items_conv = pd.merge(items, conv, left_on='itemId', right_on='itemId', how='inner')
    joined = pd.merge(joined_items_conv, users, left_on='userId', right_on='userId', how='inner')
    filtered = joined[joined['registerCountry']==country]
    grouped = filtered.groupby('category')['quantity'].sum()
    
    grouped.plot(kind='bar', title="Number of items purchased in " + country + ", grouped by category.")
    plt.show()

Displays number of items from all cattegories purchased in particular country.

In [362]:
def number_of_purchased_items_grouped_by_categories_in_all_countries():#TODO
    conv = get_data('../../fb2015/conversions.csv')
    items = get_data('../../fb2015/items.csv')
    users = get_data('../../fb2015/users.csv')
    
    joined_items_conv = pd.merge(items, conv, left_on='itemId', right_on='itemId', how='inner')
    joined = pd.merge(joined_items_conv, users, left_on='userId', right_on='userId', how='inner')
 
    #joined = joined[joined['category'] != 'NaN'].to_frame()
    #joined = joined[joined['quantity'] != 'NaN'].to_frame()
    #joined = joined[joined['registerCountry'] != 'NaN'].to_frame()
    joined['category'] = joined.category.dropna()
    joined['quantity'] = joined.quantity.dropna()
    joined['registerCountry'] = joined.registerCountry.dropna()
    #joined['quantitySum'] = joined.groupby(['registerCountry', 'category'])['quantity'].sum()
    #print(joined['quantitySum'])
    #print("+++++++++++")
    #for x in joined.groupby(['registerCountry', 'category'])['quantity'].sum():
    #    print(x)
    #print("+++++++++++")
    print(joined.groupby(['registerCountry', 'category'])['quantity'].sum())
    g = sns.FacetGrid(joined, col="registerCountry")
    g.map(sns.barplot, "category", "quantity");

    plt.show()

In [363]:
def number_of_all_items_grouped_by_categories_in_all_countries2():#TODO
    conv = get_data('../../fb2015/conversions.csv')
    items = get_data('../../fb2015/items.csv')
    users = get_data('../../fb2015/users.csv')
    
    joined_items_conv = pd.merge(items, conv, left_on='itemId', right_on='itemId', how='inner')
    joined = pd.merge(joined_items_conv, users, left_on='userId', right_on='userId', how='inner')
    #joined = joined.groupby('registerCountry')#['grouped_by_country'] = 
    #joined = joined.groupby('category')#['quantity'].sum()#['grouped_by_categoried']
    #print(type(grouped_by_country))
    #print(type(grouped_by_categoried))
    sns.jointplot(x='signupTime', y='category', data=joined, kind="hex", color="#4CB391", ratio=13)
    #kdeplot(joined['signupTime'], joined['category'])
    plt.show()

In [364]:
#def neww():
#    sns.set(style="darkgrid")
#    tips = sns.load_dataset("tips")
#    g = sns.FacetGrid(tips, row="sex", col="time", margin_titles=True)
#    bins = np.linspace(0, 60, 13)
#    g.map(plt.hist, "total_bill", color="steelblue", bins=bins, lw=0)
#    plt.show()

In [365]:
#def neww2():
#    sns.set(style="ticks")

#    rs = np.random.RandomState(11)
#    x = rs.gamma(12, size=1000)
#    y = -.5 * x + rs.normal(size=1000)

#    sns.jointplot(x, y, kind="hex", stat_func=kendalltau, color="#4CB391")
#    plt.show()

In [366]:
number_of_purchased_items_grouped_by_categories_in_all_countries()
#number_of_items_purchased_in_particular_country_grouped_by_category('United States')
#number_of_items_from_particular_category_purchased_in_all_countries()

registerCountry  category  
None             1230687117    1
                 2095064690    4
United States    48494886      6
                 463366985     3
                 489875200     3
                 525094781     1
                 938176607     1
                 938803198     1
                 1230687117    3
                 1242167586    1
                 1723305630    1
                 2095064690    7
                 2325837962    1
                 3308992943    1
                 3462847467    1
                 3751981041    1
                 3772617607    1
                 4129351864    7
                 4245010542    1
Name: quantity, dtype: int64




AttributeError: bins must increase monotonically.

In [None]:
 #grouped = joined.groupby('category')['quantity'].sum()
#summ = joined.groupby('category')['quantity'].sum()
#sns.set(style="darkgrid")
#g = sns.FacetGrid(joined, row='category', col='registerCountry', margin_titles=True)#, col="registerCountry"
#bins = np.linspace(0, 60, 13)
#g.map(plt.hist, 'quantity', color="steelblue", bins=bins, lw=0)



#joined_items_conv = pd.merge(items, conv, left_on='itemId', right_on='itemId', how='inner')
#joined = pd.merge(joined_items_conv, users, left_on='userId', right_on='userId', how='inner')
#joined['sum'] = joined.groupby('category', 'quantity')
#joined = joined.groupby('registerCountry')#['grouped_by_country'] = 
#joined = joined.groupby('category')#['quantity'].sum()#['grouped_by_categoried']
#print(type(grouped_by_country))
#print(type(grouped_by_categoried))
#sns.jointplot(x='sum', y='category', data=joined, kind="hex", color="#4CB391", ratio=13)

#kind='hist'))#(color='DarkGreen', legend=None)#kind='pie', 'subplots=True')#)#kind='scatter', 
#for x in count_of_items:
#    print(x)
#print(type(count_of_items))
#print(count_of_items.axes)
#print("++++++++++++++++++++++")
#print(count_of_items['sum'])
#x = np.random.rand(10)
#y = np.random.rand(count_of_items['itemId'])
#conversions_per_user = conv.groupby(['userId'])['timestamp', 'quantity']
#print(conversions_per_user_in_the_first_week)
#pd.DatetimeIndex(users['signupTime']).month
#users['month'] = pd.DatetimeIndex(users['signupTime']).month
#users['year'] = pd.DatetimeIndex(users['signupTime']).year
#print(users)

#conv['month'] = pd.DatetimeIndex(conv['timestamp']).month
#conv['year'] = pd.DatetimeIndex(conv['timestamp']).year

#conversions_per_user = pd.concat([users, conv], axis=1, join='inner')


#conversions_per_user = pd.concat([users, conv], axis=1, join='inner')

#conversions_per_user = conv[conv.timestamp < users.signupTime]

#for x in conversions_per_user:
#    print(x)
#print(conversions_per_yearmonth)

#conv = get_data('fb2015/conversions_part.csv')
#users = get_data('fb2015/users_part.csv')
#result = pd.concat([users, conv], axis=1, join='inner')

#data = get_data('fb2015/users.csv')
#data['signupTime'] = pd.to_datetime(data['signupTime'])#month
#data['yearmonth'] = data['signupTime'].apply(lambda x: getYearMonth(x))
#data.set_index(['signupTime'])
#print(data['signupTime'].month)
#ts = data.groupby(['yearmonth'].sum()#groupby([lambda x: x.year, lambda x: x.month]).sum()
#ts.plot()
#plt.show() 
#print(ts)
#dates.groupby()
#
#print(data['yearmonth'])
#ts = data['yearmonth'].cumsum()
#ts = data['signupTime'].groupby([lambda x: x.year, lambda x: x.month]).sum()
#df = pandas.DataFrame(np.random.randn(100, 4), columns=list('ABCD'))

#In [120]: rows = random.sample(df.index, 10)
#rindex =  np.array(sample(xrange(len(data)), 10))
#rows = random.sample(data.index, 10)

#data = data.ix[rows]

# get 10 random rows from df
#data = data.ix[rindex]