## Contents
1. [Import data](#Import-data)
1. [Experiment](#Experiment)
    1. [Descriptive statistics](#Descriptive-statistics)
    1. [Gini coefficient](#Gini-coefficient)
    1. [Market share](#Market-share)
    1. [Unpredictability](#Unpredictability)
2. [References](#References)

## Import data
We start by importing Python libraries and configuring Jupyter.

In [None]:
# Initialization
%pylab inline
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scipy.misc as spmisc

Now we'll define helper classes to read data from either the original experiment or from oTree.

In [None]:
# These helper functions create a data frame with the following columns:
#  - world_id
#  - song_id
#  - download_count
#  - rating_count
#  - mean_rating

class SDWData(object):
    '''Reader for the download files of the original SDW (2006) experiment.

    Attributes:
        path: directory that contains the `musiclab_data` folder.
        independent_world: id of the world treated as the independent
            (no social influence) condition by the analysis cells.
    '''

    # Song and world ids covered by the experiment-1 download files
    SONG_IDS = range(1, 49)
    WORLD_IDS = range(1, 10)

    def __init__(self, path="external/CW", independent_world=9):
        self.path = path
        # Honour the caller's choice instead of always storing 9
        self.independent_world = independent_world

    def get_world_song(self):
        '''Read every world's download file and return per-world/song totals.

        Returns:
            DataFrame with columns world_id, song_id, download_count,
            rating_count and mean_rating (see `sdw_to_world_song`).
        '''
        # Load data from all worlds in SDW experiment 1
        downloads_file = "{path}/musiclab_data/dynamics_downloads_download_w{world}_v{experiment}.txt"
        columns = ['user_id', 'world_id'] \
            + ["dl_{i}".format(i=i) for i in self.SONG_IDS] \
            + ['timestamp']
        df_raw = pd.concat([
            pd.read_csv(
                downloads_file.format(path=self.path, world=w, experiment=1),
                comment="%",   # lines starting with % are skipped
                header=None,   # files carry no header row; names come from `columns`
                names=columns
            )
            for w in self.WORLD_IDS])
        return self.sdw_to_world_song(df_raw)

    # Convert SDW2006 data to a data frame with a single count column
    def sdw_to_world_song(self, df_raw):
        '''Collapse raw rows into one row per (world, song) pair.

        Args:
            df_raw: DataFrame with a `world_id` column and one `dl_<song>`
                column for every id in `SONG_IDS`.

        Returns:
            DataFrame ordered by world then song, where `download_count` is
            the sum of the song's `dl_<song>` column over the world's rows.
            `rating_count` and `mean_rating` are filled with 0.0.
        '''
        col_world_id = []
        col_song_id = []
        col_count = []
        # Only the worlds actually present in df_raw are reported
        for cur_world in sorted(set(df_raw["world_id"])):
            df_world = df_raw[df_raw["world_id"] == cur_world]
            for cur_song in self.SONG_IDS:
                col_world_id.append(cur_world)
                col_song_id.append(cur_song)
                col_count.append(df_world["dl_{}".format(cur_song)].sum())
        return pd.DataFrame({
            "world_id": col_world_id,
            "song_id": col_song_id,
            "download_count": col_count,
            "rating_count": 0.0,
            "mean_rating": 0.0
        })

class OTreeData(object):
    '''Reader for a csv export of the oTree version of the experiment.

    Attributes:
        data: path (or file-like object) handed to `pd.read_csv`.
        session: value of the `session.code` column to keep.
        independent_world: id of the world treated as the independent
            (no social influence) condition by the analysis cells.
    '''

    def __init__(self, data="data/cultural_market.csv", session=None, independent_world=0):
        self.data = data
        self.session = session
        # Store under the same attribute name that SDWData exposes
        self.independent_world = independent_world

    # Load oTree data into a data frame
    def get_world_song(self):
        '''Return per-world/song download and rating totals for the session.

        Returns:
            DataFrame with columns world_id, song_id, download_count,
            rating_count and mean_rating, ordered by world then song.
            Only ratings greater than 0 are counted; `mean_rating` is NaN
            when a song has no such rating in a world.

        Raises:
            ValueError: if no row of the csv matches `self.session`.
        '''
        # Read csv in oTree format
        df_raw = pd.read_csv(self.data)
        # Remove all but desired session
        df_raw = df_raw[df_raw['session.code'] == self.session]
        if df_raw.empty:
            # Fail with a clear message instead of an IndexError further down
            raise ValueError(
                "no rows found for session {!r}".format(self.session))
        # Generate list of songs and worlds; the song count is taken from
        # the rows x cols values stored on the first matching row
        row_count = int(df_raw["player.rows"].iloc[0])
        col_count = int(df_raw["player.cols"].iloc[0])
        song_ids = range(row_count * col_count)
        world_ids = sorted(set(df_raw["player.world"]))
        # Count totals for each world/song combination
        col_world_id = []
        col_song_id = []
        col_download_count = []
        col_mean_rating = []
        col_rating_count = []
        for cur_world in world_ids:
            df_world = df_raw[df_raw["player.world"] == cur_world]
            for cur_song in song_ids:
                # Record song and world id
                col_world_id.append(cur_world)
                col_song_id.append(cur_song)
                # Count the number of downloads
                col_download_count.append(
                    df_world["player.download_{}".format(cur_song)].sum())
                # Find the number of ratings and average rating
                rating_label = "player.rating_{}".format(cur_song)
                df_ratings = df_world[df_world[rating_label] > 0]
                col_rating_count.append(len(df_ratings))
                col_mean_rating.append(df_ratings[rating_label].mean())
        return pd.DataFrame({
            "world_id": col_world_id,
            "song_id": col_song_id,
            "download_count": col_download_count,
            "rating_count": col_rating_count,
            "mean_rating": col_mean_rating
        })

### Read the data into a data frame
The next cell reads in the data from the original Salganik, Dodds, and Watts experiment. To analyze data from oTree instead, comment out the first line, remove the `#` from the beginning of the second line, and enter the path to the csv file downloaded from oTree together with the oTree session code of your experiment. If you change the data source, also make sure to change the id of the independent world.

In [None]:
# Build the per-world, per-song table from the original SDW data files.
# NOTE(review): this passes path="external/CM" while SDWData's default is
# "external/CW" -- confirm which directory actually holds musiclab_data.
data = SDWData(path="external/CM", independent_world=9)
#data = OTreeData(data="data/cultural_market.csv", session="iddr7oao", independent_world=0)
df_downloads = data.get_world_song()
# Preview the first rows of the table
df_downloads.head()

## Experiment
First we must choose which quantity to analyze. The options are `download_count` (as in the original experiment) or `mean_rating`.

In [None]:
# Column of df_downloads that the cells below analyse; read as a global by
# find_market_share. Supported values: "download_count" or "mean_rating".
analysis_column = "download_count"

### Descriptive statistics
First we define functions to count the total number of downloads for each song and in each world.

In [None]:
# Count downloads for each song
def count_song_stats(df_downloads):
    '''Aggregate downloads and ratings per song over all worlds.

    Args:
        df_downloads: DataFrame with song_id, download_count, rating_count
            and mean_rating columns (one row per world/song pair).

    Returns:
        DataFrame indexed by song_id (sorted) with columns:
          - download_count: sum of the song's download counts
          - mean_rating: rating_count-weighted mean of mean_rating, NaN when
            the song has no ratings
          - rating_count: sum of the song's rating counts
        Rows whose mean_rating is NaN are left out of both rating columns.
    '''
    # Create list of song_id values present in input
    song_ids = sorted(set(df_downloads["song_id"]))
    # Create DataFrame for songs
    df_songs = pd.DataFrame({"song_id": song_ids}) \
        .set_index("song_id")
    df_songs["download_count"] = \
        df_downloads.groupby("song_id")["download_count"].sum()
    # Only rows that carry a rating contribute to the rating statistics
    rated = df_downloads[df_downloads["mean_rating"].notna()]
    rating_total = (rated["mean_rating"] * rated["rating_count"]) \
        .groupby(rated["song_id"]).sum() \
        .reindex(df_songs.index, fill_value=0.0)
    rating_count = rated.groupby("song_id")["rating_count"].sum() \
        .reindex(df_songs.index, fill_value=0)
    # Each song gets its own totals; a zero count is masked so the mean is
    # NaN rather than the result of a division by zero
    df_songs["mean_rating"] = \
        rating_total / rating_count.where(rating_count != 0)
    df_songs["rating_count"] = rating_count
    return df_songs

# Count downloads for each world
def count_world_stats(df_downloads):
    '''Aggregate downloads and ratings per world over all songs.

    Args:
        df_downloads: DataFrame with world_id, download_count, rating_count
            and mean_rating columns (one row per world/song pair).

    Returns:
        DataFrame indexed by world_id (sorted) with columns:
          - download_count: sum of the world's download counts
          - rating_count: sum of the world's rating counts
          - mean_rating: rating_count-weighted mean of mean_rating, NaN when
            the world has no ratings
        Rows whose mean_rating is NaN are left out of both rating columns.
    '''
    # Create list of world_id values present in input
    world_ids = sorted(set(df_downloads["world_id"]))
    # Create DataFrame for worlds
    df_worlds = pd.DataFrame({"world_id": world_ids}) \
        .set_index("world_id")
    # Count downloads for each world
    df_worlds["download_count"] = \
        df_downloads.groupby("world_id")["download_count"].sum()
    # Only rows that carry a rating contribute to the rating statistics
    rated = df_downloads[df_downloads["mean_rating"].notna()]
    rating_total = (rated["mean_rating"] * rated["rating_count"]) \
        .groupby(rated["world_id"]).sum() \
        .reindex(df_worlds.index, fill_value=0.0)
    rating_count = rated.groupby("world_id")["rating_count"].sum() \
        .reindex(df_worlds.index, fill_value=0)
    # Each world gets its own totals; a zero count is masked so the mean is
    # NaN rather than the result of a division by zero
    df_worlds["rating_count"] = rating_count
    df_worlds["mean_rating"] = \
        rating_total / rating_count.where(rating_count != 0)
    return df_worlds

In [None]:
# Aggregate the per-world/song table by song and by world; both frames are
# reused by later cells
df_songs = count_song_stats(df_downloads)
df_worlds = count_world_stats(df_downloads)
# Histogram of the analysed quantity per song, skipping NaN entries
plt.hist(df_songs[analysis_column].dropna(), bins=10)
plt.xlabel(analysis_column)
plt.ylabel("Frequency")

### Gini coefficient

In [None]:
def gini(x):
    '''Given a list of counts `x`, return the gini coefficient.

    The coefficient is the sum of absolute differences over all ordered
    pairs divided by `2 * n * sum(x)`. NaN entries are ignored.

    Args:
        x: iterable of numbers (list, Series, ...).

    Returns:
        The Gini coefficient as a float, or NaN when it is undefined
        because no values remain or the values sum to zero.
    '''
    values = np.asarray([xi for xi in x if not np.isnan(xi)], dtype=float)
    n = len(values)
    # Array methods are used instead of the name `sum`, which `%pylab`
    # rebinds to numpy's version
    gini_den = 2.0 * n * values.sum()
    if gini_den == 0:
        # Empty or all-zero input: avoid a division by zero
        return float("nan")
    # Pairwise |x_i - x_j| via broadcasting
    gini_num = np.abs(values[:, None] - values[None, :]).sum()
    return gini_num / gini_den

In [None]:
# Compute one Gini coefficient per world from that world's rows, then draw
# the coefficients as a bar chart
gini_by_world = []
for cur_world in df_worlds.index:
    world_values = df_downloads.loc[
        df_downloads["world_id"] == cur_world, analysis_column]
    gini_by_world.append(gini(world_values))
df_worlds["gini"] = gini_by_world
plt.bar(df_worlds.index, df_worlds["gini"])
plt.xlabel("World")
plt.ylabel("Gini coefficient")

### Market share

In [None]:
# Calculate market shares
def find_market_share(df_downloads):
    '''Return a copy of `df_downloads` with an added `market_share` column.

    The share is computed within each world, using the quantity selected by
    the global `analysis_column`:
      - 'download_count': download_count / the world's total downloads
      - 'mean_rating': mean_rating * rating_count divided by the world's
        rating_count and by the world's mean_rating

    Args:
        df_downloads: DataFrame with world_id, song_id, download_count,
            rating_count and mean_rating columns. Any index is accepted.

    Returns:
        A copy of the input with a `market_share` column. A zero world
        total yields NaN/inf in that column rather than an exception.

    Raises:
        ValueError: if `analysis_column` is not one of the two values above.
    '''
    # Create a copy of the input to return
    df = df_downloads.copy()
    # Count the total downloads for each world
    df_worlds = count_world_stats(df)
    # Look up every row's world totals by world label, not by row position,
    # so the result is correct for any index (e.g. a filtered subset)
    row_world = df["world_id"]
    # Calculate the market share
    if analysis_column == 'mean_rating':
        df["market_share"] = (
            df["mean_rating"] * df["rating_count"]
            / row_world.map(df_worlds["rating_count"])
            / row_world.map(df_worlds["mean_rating"]))
    elif analysis_column == 'download_count':
        df["market_share"] = (
            df["download_count"]
            / row_world.map(df_worlds["download_count"]))
    else:
        # Fail here instead of returning a frame without `market_share`
        raise ValueError(
            "unsupported analysis_column: {!r}".format(analysis_column))
    return df

In [None]:
# Calculate market share for each song/world
# (find_market_share works on a copy; only its market_share column is kept)
df_downloads["market_share"] = find_market_share(df_downloads)["market_share"]

# Copy market share, and convert to rank one world at a time
# (ascending=False: the largest share in a world gets rank 1)
ranks = []
for cur_world in sorted(set(df_downloads["world_id"])):
    df = df_downloads[df_downloads["world_id"] == cur_world].copy()
    df["market_rank"] = df["market_share"].rank(ascending=False)
    # Store results for this world in a list
    ranks.append(df)
# Concatenate results for all worlds; assignment aligns on the row index
df_downloads['market_rank'] = pd.concat(ranks)['market_rank']

# Remove nan entries: a song with a NaN market share in any world is dropped
# from every world. The filtered table is left in `df` for the plotting
# cell; df_downloads itself keeps all rows.
nan_songs = list(df_downloads[np.isnan(df_downloads["market_share"])]["song_id"])
df = df_downloads
for cur_song in nan_songs:
    df = df[df["song_id"] != cur_song]

In [None]:
# Create a figure
# The size is passed as a keyword so the cell does not depend on the
# `figsize` helper that `%pylab` injects (which changes the default size
# for all later figures)
plt.figure(figsize=(8, 4))

# Plot social influence market share vs independent market share
# Create subplots and use first
plt.subplot(1,2,1)
world_ids = sorted(set(df_downloads["world_id"]))
# Every world except the independent one is a social influence world
dependent_worlds = [x for x in world_ids if x != data.independent_world]
for cur_world in dependent_worlds:
    # `df` is the NaN-filtered table built in the previous cell
    plt.plot(
        df[df["world_id"] == data.independent_world]['market_share'],
        df[df["world_id"] == cur_world]['market_share'], '.b')
plt.xlabel("Market share (Indep.)")
plt.ylabel("Market share (Social)")

# Plot social rank vs independent rank in second subplot
plt.subplot(1,2,2)
for cur_world in dependent_worlds:
    plt.plot(
        df[df["world_id"] == data.independent_world]['market_rank'],
        df[df["world_id"] == cur_world]['market_rank'], '.b')
plt.xlabel("Market rank (Indep.)")
plt.ylabel("Market rank (Social)")
plt.tight_layout()


### Unpredictability

In [None]:
def find_unpredictability(df_downloads):
    '''Return a DataFrame indexed by song_id with an `unpredictability` column.

    For each song, the unpredictability is the absolute difference in market
    share between two worlds, averaged over every unordered pair of worlds
    present in `df_downloads` (equation in SDW2006).

    Args:
        df_downloads: DataFrame with world_id and song_id columns plus the
            columns required by `find_market_share`.

    Returns:
        DataFrame indexed by song_id with an `unpredictability` column. The
        column is NaN when fewer than two worlds are present. A missing or
        NaN market share counts as 0.
    '''
    # Local import keeps this cell self-contained
    from itertools import combinations

    # Create the DataFrame and index from the provided download data
    df = pd.DataFrame({"song_id": list(set(df_downloads["song_id"]))}) \
        .set_index("song_id")
    # Get market share of each song in each world
    df_market = find_market_share(df_downloads)
    # Song x world table of market shares
    shares = df_market.groupby(["song_id", "world_id"])["market_share"] \
        .sum() \
        .unstack("world_id", fill_value=0.0)
    # Pair up the worlds that occur in the input itself, not the worlds of
    # the notebook-level df_worlds, so subsets and re-labelled worlds work
    world_pairs = list(combinations(sorted(shares.columns), 2))
    if not world_pairs:
        df["unpredictability"] = np.nan
        return df
    # Accumulate with `+` rather than `sum`, which `%pylab` rebinds
    total = pd.Series(0.0, index=shares.index)
    for world_j, world_k in world_pairs:
        total = total + (shares[world_j] - shares[world_k]).abs()
    # len(world_pairs) equals "number of worlds choose 2"
    df["unpredictability"] = total / float(len(world_pairs))
    return df

In [None]:
# Calculate unpredictability in social influence worlds
# (every world except the independent one configured on `data`)
df_social = df_downloads[df_downloads["world_id"] != data.independent_world]
df_songs["unpredictability"] = find_unpredictability(df_social)["unpredictability"]

# Calculate unpredictability in independent world
# NOTE(review): `df_raw` has to be the raw table with a `world_id` column
# and one `dl_<song>` column per song. No visible cell defines it at
# notebook scope -- confirm it is loaded before running this cell.
# First, copy data for only independent world
df_indep = df_raw[df_raw["world_id"] == data.independent_world].copy()
# Repeatedly split users into two random worlds and calculate uncertainties
unpredictability = []
num_iter = 50
for i in range(num_iter):
    # Assign every row to world 0 or 1 at random
    df_indep["world_id"] = np.random.randint(0, 2, size=len(df_indep))
    # Convert the raw rows to per-world/song counts. This step only works
    # when `data` is an SDWData instance.
    df_indep_dl = data.sdw_to_world_song(df_indep)
    u_i = find_unpredictability(df_indep_dl)["unpredictability"]
    unpredictability.append(u_i)
# Average results over the iterations, element by element
u = np.sum(unpredictability, axis=0) / float(num_iter)
# Add to the song DataFrame
df_songs["unpredictability_indep"] = u

In [None]:
# Average the per-song unpredictability over all songs for the social
# influence worlds and for the independent world, then compare the two
# averages in a bar chart
song_count = float(len(df_songs))
u_social = df_songs["unpredictability"].sum() / song_count
u_indep = df_songs["unpredictability_indep"].sum() / song_count
plt.figure(figsize=(6,4))
bar_positions = [1, 2]
bar_heights = [u_social, u_indep]
plt.bar(bar_positions, bar_heights)
plt.xlabel("World")
plt.ylabel("Unpredictability")


## References

1. Salganik, M. J., Dodds, P. S., & Watts, D. J. (2006). Experimental study of inequality and unpredictability in an artificial cultural market. _Science_, 311(5762), 854-856.

In [None]:
# Debugging aid: show the ten rows of df_raw with the smallest timestamps.
# NOTE(review): df_raw is not defined at notebook scope in any visible
# cell -- confirm where it comes from, or remove this cell.
df_raw.sort_values("timestamp").head(10)

In [None]:
# Debugging aid: show world_ids from the ninth-from-last entry up to, but
# not including, the last entry
world_ids[-9:-1]