In [None]:
#!pip install kaggle
import itertools
import os
import pickle
import random
import timeit
import urllib.request
import zipfile
from pathlib import Path

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from networkx.algorithms import bipartite

Data from: https://www.kaggle.com/leomauro/smmnet

# Datenbeschaffung & Aufbereitung

## Setup kaggle api

In [None]:
# Create the .kaggle directory in the user's home directory if it does not exist yet.
# Joining with pathlib keeps this portable: the former string suffix "\.kaggle" was a
# Windows-only separator written as an invalid "\." escape sequence.
kaggle_dir = Path.home() / ".kaggle"
os.makedirs(kaggle_dir, exist_ok=True)

Go to `https://www.kaggle.com/<your_username>/account` and create a new API token.
After downloading, move the `kaggle.json` file into the `.kaggle` directory of your home folder (on Windows: `C:\Users\<username>\.kaggle`).

## Download dataset

In [None]:
# Check if folder already exists
if not os.path.isdir('./data'):
    
    # if not create data folder
    os.makedirs('./data') 
    
    # download csv's
    !kaggle datasets download -d "leomauro/smmnet" -p "./data"

## Unzip files

In [None]:
file_list = ["clears.csv","course-meta.csv","courses.csv","likes.csv","players.csv","plays.csv","records.csv"]

# One flag per expected csv: True when the file is already present in ./data
files_exist = [os.path.isfile("./data/" + file) for file in file_list]

# Extract the downloaded archive as soon as at least one csv is missing
if False in files_exist:
    with zipfile.ZipFile("./data/smmnet.zip", 'r') as zip_ref:
        zip_ref.extractall("./data")

## Import files as Pandas DataFrame

In [None]:
# Every table is stored tab-separated; `sep` is pandas' primary name for `delimiter`.
TSV = {"sep": "\t"}

clears = pd.read_csv("./data/clears.csv", **TSV)
# possible index for course_meta: index_col=["id","firstClear"]
course_meta = pd.read_csv("./data/course-meta.csv", **TSV)
courses = pd.read_csv("./data/courses.csv", **TSV)
likes = pd.read_csv("./data/likes.csv", **TSV)
players = pd.read_csv("./data/players.csv", **TSV)
plays = pd.read_csv("./data/plays.csv", **TSV)
records = pd.read_csv("./data/records.csv", **TSV)

# Data Validation

# EDA

### Welche Nationalitäten sind unter den Spielern am häufigsten vertreten?

In [None]:
def player_nationalities(players, top_n):
    '''Plots and returns the number of players for the top_n most frequent nationalities.

    players: DataFrame with one row per player and a "flag" column (nationality).
    top_n:   number of nationalities shown individually; all remaining ones are
             summed up in a final "Others" entry.

    Returns a Series indexed by flag (plus "Others") holding the player counts.
    The input frame is not modified.
    '''
    # rows per flag, most frequent first
    counts = players.groupby("flag").size().sort_values(ascending=False)

    # Series.append() was removed in pandas 2.0 -> build the result with pd.concat
    others = pd.Series([counts.iloc[top_n:].sum()], index=["Others"])
    top_nationalities = pd.concat([counts.iloc[:top_n], others])

    plt.bar(x=top_nationalities.index, height=top_nationalities, color="#FF1E2D")
    plt.title("Nr of Players by Nationality")
    plt.xlabel("Country")
    plt.ylabel("count [n]")
    plt.show()

    return top_nationalities

# Plot the ten most frequent nationalities and print the counts behind the bars
print(player_nationalities(players, top_n=10))

Im Datensatz befinden sich zu je einem Drittel US-amerikanische und japanische Spieler. Danach folgen Frankreich (5.6%), Deutschland (5.4%) und Kanada (5.2%). Insgesamt enthält der Datensatz 884302 Spielerprofile aus 82 verschiedenen Nationen.

### Wie sieht die Schwierigkeitsverteilung der kreierten Levels aus?

In [None]:
def level_difficilties():
    '''Plots the number of courses per difficulty as a bar chart.

    Reads the notebook-level ``courses`` DataFrame (columns "id" and "difficulty").
    The counts are reindexed to the ordinal order easy < normal < expert <
    superExpert before plotting: groupby sorts its keys alphabetically (easy, expert,
    normal, superExpert), so passing its result to plt.bar together with the
    hand-written label list attached the "normal" and "expert" labels to each
    other's bars. Difficulties that do not occur are shown with a count of 0.
    '''
    order = ["easy", "normal", "expert", "superExpert"]
    colors = ["#2CB01A","#FBD000", "#FF1E2d", "#000000"]
    course_difficulties = courses.groupby("difficulty")["id"].count().reindex(order, fill_value=0)
    plt.bar(x=order, height=course_difficulties, color=colors)
    plt.title("Nr of Levels by Difficulty")
    plt.xlabel("Difficulty")
    plt.ylabel("count [n]")
    plt.show()
    
# Draw the difficulty distribution of all courses
level_difficilties()

Die einzelnen Levels können einer von vier ordinalen Schwierigkeitsstufen zugeordnet werden. Die Reihenfolge in aufsteigender Schwierigkeit lautet: easy < normal < expert < superExpert. Die relativen Anteile der Kategorien lauten: 26 Prozent easy, 24 Prozent normal, 45 Prozent expert und 5 Prozent superExpert.


# Social Network Analysis

## Player-Level interaction

The dataset contains the table "players", which lists all players, and the table "plays", which contains all interactions of these players with levels. By merging these two tables we can create a network of all interactions between players and levels.

### Create Graph

In [None]:
# Merge players with their plays. The left join keeps players without any recorded
# play; such rows carry NaN in "player" and "level_id".
player_game_interactions = pd.merge(players, plays.rename(columns={"id": "level_id"}), how="left", left_on="id", right_on="player")

# Only rows describing an actual play may become level nodes and edges. Using the
# NaN rows as well put a spurious NaN node with a NaN-NaN self-loop into the graph.
actual_plays = player_game_interactions.dropna(subset=["player", "level_id"])

player_level_interactions = nx.Graph()

# players without plays stay in the graph as isolated nodes
player_nodes = list(player_game_interactions["id"].unique())
level_nodes = list(actual_plays["level_id"].unique())
edges = actual_plays[["player", "level_id"]].values.tolist()

player_level_interactions.add_nodes_from(player_nodes)
player_level_interactions.add_nodes_from(level_nodes)
player_level_interactions.add_edges_from(edges)

In [None]:
# nx.info() was removed in networkx 3.0; printing the graph gives the node/edge
# summary and the average degree is computed explicitly.
print(player_level_interactions)
n_nodes = player_level_interactions.number_of_nodes()
n_edges = player_level_interactions.number_of_edges()
# every edge contributes to the degree of two nodes
print("Average degree:", round(2 * n_edges / n_nodes, 4) if n_nodes else 0)
print("Average level_node degree:",np.mean(list(dict(player_level_interactions.degree(level_nodes)).values())))
print("Average player_node degree:",np.mean(list(dict(player_level_interactions.degree(player_nodes)).values())))

The graph has a total of 999'335 nodes (players + levels) and 3'941'379 edges. The average node degree is 7.89. Due to the large number of nodes, it is not possible to visualize the complete graph, both for reasons of clarity and because of limited computing power. However, we can examine the graph more closely for its attributes.

### Node degrees 
Der Node-Degree eines Spielers entspricht der Anzahl der von ihm gespielten Levels. In einem ersten Versuch wollten wir die Node-Degrees mit der in networkx vorhandenen Funktion "bipartite.degrees()" bestimmen. Das dauerte jedoch zu lange.

In [None]:
def plot_player_degrees_nx(player_game_interactions):
    '''Draws the player node degrees of the bipartite graph as a bar chart.

    Takes too long on the full graph. The argument is not used: the function reads
    the notebook-level ``player_level_interactions`` graph and ``player_nodes`` list.
    '''
    # only the second result of bipartite.degrees() is plotted as the player degrees
    _, player_degree_view = bipartite.degrees(player_level_interactions, player_nodes)
    degree_by_player = dict(player_degree_view)
    plt.bar(list(degree_by_player.keys()), degree_by_player.values(), color="#3944BC")
    plt.show()

Alternativ wurden die Node-Degrees deshalb direkt aus dem ursprünglichen Pandas-DataFrame errechnet.

In [None]:
def plot_player_degrees_pd(player_game_interactions):
    '''Counts the level interactions per player, plots them as a log-scaled histogram
    and returns them.

    player_game_interactions: merged players/plays frame; the non-null "player"
    entries per "id" are used as the number of interactions of that player.

    Returns a frame with the columns "id" and "interactions", sorted by
    "interactions" in descending order.
    '''
    per_player = player_game_interactions.groupby("id").count().reset_index()
    data = per_player.rename(columns={"player": "interactions"})
    interactions = data["interactions"]

    # one bin per possible interaction count
    plt.figure(figsize=(12, 6))
    plt.hist(interactions, bins=range(min(interactions), max(interactions) + 1, 1), color="#3944BC")
    plt.yscale('log')
    plt.title("Distribution of level interactions per player")
    plt.xlabel("Nr of level interactions")
    plt.ylabel("count [n]")
    plt.show()

    degrees = data.drop(columns=["image","flag","name","catch","level_id"])
    return degrees.sort_values("interactions", ascending=False).reset_index(drop=True)
    
# Interactions per player (sorted, most active first) plus their distribution
node_degrees = plot_player_degrees_pd(player_game_interactions)
print("Quantiles:")
# minimum, quartiles and maximum of the interactions per player
print(node_degrees["interactions"].quantile([0,0.25,0.5,0.75,1]))

Im Histogramm sehen wir, dass die meisten Spieler nur mit sehr wenigen Levels interagiert haben. 75 Prozent der Spieler haben vier oder weniger Levels gespielt. Der mediane Node-Degree liegt bei nur 2. Es gibt aber einen kleinen Anteil an Spielern, welche mit sehr vielen Levels interagiert haben. Als Nächstes suchen wir nach allen Spielern, welche mindestens 1000 Levelinteraktionen haben.

In [None]:
# The selection is a threshold (at least 1000 interactions), not a fixed top-10 cut,
# so the heading names the threshold instead of "10 most interactive players".
print("Players with at least 1000 level interactions:\n")
print(node_degrees.loc[node_degrees["interactions"]>=1000])

Der aktivste Spieler hat insgesamt 2681 unterschiedliche Levels gespielt. Insgesamt befinden sich im Datensatz 12 Spieler, welche mit mindestens 1000 Levels interagiert haben.

### Local View Top 12 active players two-mode

In [None]:
def top_n_players(players, plays, n):
    '''Returns the level interactions of the n players with the most level interactions.

    players and plays are joined (left join on players.id == plays.player, the level
    id of plays is renamed to "level_id"); the result contains only the rows of the
    n players with the highest row count and has a fresh 0..k-1 index.
    '''
    plays_renamed = plays.rename(columns={"id": "level_id"})
    merged = pd.merge(players, plays_renamed, how="left", left_on="id", right_on="player")
    # rank the players by their number of joined rows and keep the n largest
    ranking = merged.groupby("player").count().nlargest(n, 'id').reset_index()
    most_active = ranking["player"].tolist()
    is_most_active = merged['player'].isin(most_active)
    return merged.loc[is_most_active].reset_index(drop=True)

# number of most active players to look at (also used in the plot title below)
n = 12
n_player_interactions = top_n_players(players, plays, n)

In [None]:
# Two-mode graph of the selected players and every level they interacted with
big_n = nx.Graph()

# node and edge lists are taken from the filtered interaction table
player_nodes = list(n_player_interactions["player"].unique())
level_nodes = list(n_player_interactions["level_id"].unique())
edges = n_player_interactions[["player", "level_id"]].values.tolist()


big_n.add_nodes_from(player_nodes)
big_n.add_nodes_from(level_nodes)
# every player-level edge gets the same weight attribute
big_n.add_edges_from(edges, weight=0.5)

# fixed seed so the layout is the same on every run
pos = nx.drawing.layout.spring_layout(big_n, seed=28)

plt.figure(figsize=(20, 20))
ax = plt.gca()
ax.set_title("Level interactions of " + str(n) + " most interacting players")

# levels: small red dots, players: larger blue dots
nx.draw_networkx_nodes(big_n, pos, nodelist=level_nodes, node_color="#FF1E2D", node_size=1)
nx.draw_networkx_nodes(big_n, pos, nodelist=player_nodes, node_color="#3944BC", node_size=8)

nx.draw_networkx_edges(big_n, pos)

# optional export of the figure:
# plt.savefig("Level interaction of " + str(n) + " most interacting players.png", dpi=800, format="PNG")

### Local View Top 12 active players one-mode

In [None]:
# One-mode projection of the top-n graph onto the player nodes; the "weight"
# attribute of every projected edge is shown as edge label.
weighted_projection = bipartite.weighted_projected_graph(big_n, player_nodes)
print(weighted_projection)

plt.figure(figsize=(12, 12))
ax = plt.gca()
# The former title 'Level interaction of random 500 players' was a copy-paste
# leftover: this figure shows the projection of the n most interacting players.
ax.set_title("Shared levels between the " + str(n) + " most interacting players")
pos = nx.drawing.layout.random_layout(weighted_projection, seed = 10)

# (u, v) -> weight of the projected edge
edge_labels = {(u, v): d['weight'] for u, v, d in weighted_projection.edges(data=True)}

nx.draw(weighted_projection, pos, with_labels=True, connectionstyle='arc3', node_color = "#3944BC", font_weight='heavy')
nx.draw_networkx_edge_labels(weighted_projection,
                             pos, edge_labels=edge_labels,
                             label_pos=0.5, font_size=10,
                             font_weight='heavy',
                             font_color="#3944BC",
                             bbox=dict(facecolor='white',edgecolor='none',alpha=1, pad=0.0))

plt.draw()
plt.show()

In [None]:
# Player pairs ordered by the weight of their projected edge, heaviest first
edge_weights = dict(edge_labels)
dict(sorted(edge_weights.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
def random_n_players(players, plays, n, seed=None):
    '''Returns the level interactions of n randomly chosen players.

    players, plays: the players and plays tables; they are joined with a left join on
                    players.id == plays.player (level id renamed to "level_id").
    n:              number of players to draw (without replacement).
    seed:           optional seed for a reproducible draw; by default the global
                    state of the random module is used.

    Only players with at least one play can be drawn. Raises ValueError when n is
    larger than the number of such players.
    '''
    #merge players df with plays df
    player_game_interactions = pd.merge(players, plays.rename(columns={"id": "level_id"}), how="left", left_on="id", right_on="player")
    # random.sample() needs a sequence (sets are rejected since Python 3.11). The NaN
    # that the left join creates for players without plays must not be a candidate.
    candidates = player_game_interactions["player"].dropna().unique().tolist()
    rng = random if seed is None else random.Random(seed)
    n_random_players = rng.sample(candidates, n)
    n_player_interactions = player_game_interactions.loc[player_game_interactions['player'].isin(n_random_players)].reset_index(drop=True)
    return n_player_interactions

# Draw one percent of the players (len(players)/100, truncated to an integer)
n_player_interactions = random_n_players(players, plays, int(len(players)/100))
n_player_interactions.head()

In [None]:
# Two-mode graph of the randomly drawn players and the levels they interacted with
random_n = nx.Graph()

player_nodes = list(n_player_interactions["player"].unique())
level_nodes = list(n_player_interactions["level_id"].unique())
edges = n_player_interactions[["player", "level_id"]].values.tolist()

random_n.add_nodes_from(player_nodes)
random_n.add_nodes_from(level_nodes)
random_n.add_edges_from(edges)

# fixed seed so the layout is the same on every run
pos = nx.drawing.layout.spring_layout(random_n, seed=98)

plt.figure(figsize=(25, 25))
ax = plt.gca()
ax.set_title('Level interaction of random 1 percent of players')

# levels in red, players in blue
nx.draw_networkx_nodes(random_n, pos, nodelist=level_nodes, node_color="#FF1E2D", node_size=1)
nx.draw_networkx_nodes(random_n, pos, nodelist=player_nodes, node_color="#3944BC", node_size=1)

nx.draw_networkx_edges(random_n, pos)

# savefig() does not create missing folders, so make sure ./images exists
os.makedirs('./images', exist_ok=True)
plt.savefig('./images/Level interaction of random 1 percent of players.png', dpi=800, format="png")
plt.draw()
plt.show()

## Likes Network

In [None]:
# Size of the likes table and number of distinct players that appear in it
n_likes = len(likes)
n_liking_players = len(likes["player"].unique())
print("The \"Likes\" table contains " + str(n_likes) + " likes from " + str(n_liking_players) + " different players")

Als Erstes möchten wir untersuchen, wie viele Spieler im Datensatz auch tatsächlich Likes bei Levels hinterlassen. Um das zu erreichen, müssen die Likes pro Spieler gezählt werden. Die Anzahl Likes pro Spieler entspricht im folgenden Graph dann dem Node-Degree des jeweiligen Spielers.

In [None]:
# Number of liked levels per player (only players that liked at least one level)
likes_per_player = likes[["player","id"]].groupby("player").count().reset_index().rename(columns={"id":"levelsLiked"})

# Attach the counts to all players; players without any like get 0.
# Building player_likes in one expression keeps this cell re-runnable.
player_likes = (
    pd.merge(players, likes_per_player, how="left", left_on="id", right_on="player")
    .drop(columns="player")
    .fillna({"levelsLiked": 0})
)

rating_player_ratio = len(player_likes.loc[player_likes["levelsLiked"]!=0]) / len(player_likes)
rated_pct = round(rating_player_ratio*100, 2)
print("Has rated: " + str(rated_pct) + "%")
# round again after subtracting, otherwise float noise (e.g. 79.40000000000001) can show up
print("Not rated: " + str(round(100 - rated_pct, 2)) + "%")

Von allen Spielern im Datensatz haben 20,6 Prozent (jeder fünfte Spieler) mindestens ein Level bewertet. Entsprechend hinterlässt die Mehrheit der Spieler, rund 80 Prozent, keine Bewertungen. Um ein genaueres Bild der Node-Degrees aller Spieler im "Likes-Netzwerk" zu erhalten, werden diese im nächsten Schritt als Histogramm angezeigt.

In [None]:
# Histogram (log-scaled y axis) of the number of liked levels per player,
# with one bin per possible like count
liked_counts = player_likes["levelsLiked"]
fewest, most = int(min(liked_counts)), int(max(liked_counts))

plt.figure(figsize=(12, 6))
plt.hist(liked_counts, bins=range(fewest, most + 1, 1), color="#3944BC")
plt.yscale('log')
plt.title("Distribution of Nr of level liked by players")
plt.xlabel("likes")
plt.ylabel("players [n]")
plt.show()

# the upper quantiles are listed in more detail than the lower ones
print("Quantiles:")
print(liked_counts.quantile([0,0.25,0.5,0.75,0.8,0.9,0.95,0.99,0.999,1]))

Wenn wir das Histogramm betrachten, dann sehen wir die Verteilung der Anzahl Levelbewertungen pro Spieler. Wir erkennen nun noch deutlicher, dass ein grosser Teil keine oder nur wenige Likes abgegeben hat. Es gibt jedoch einen sehr kleinen Anteil an Spielern, welche sehr viel bewertet haben. Der Spieler mit den meisten Likes hat insgesamt 3365 Levels geliked. Diese Zahl ist grösser als die maximale Anzahl an gespielten Levels und kann dadurch erklärt werden, dass ein Level nicht gespielt werden muss, damit der Spieler ein Like hinterlassen kann. Als Nächstes möchten wir das Like-Netzwerk als Graph generieren.

### Pre-filtering
Aus Effizienzgründen werden Spieler mit nur wenigen Likes herausgefiltert.

In [None]:
def minimal_likes(df, n):
    '''Keeps only the likes of players that have at least n likes.

    df: likes table with the columns "catch", "id" and "player".
    n:  minimum number of likes a player needs to be kept.

    Returns the remaining rows sorted by "player" with a fresh 0..k-1 index.
    '''
    counts = df.groupby("player").count()
    counts = counts.drop(columns="id").rename(columns={"catch": "likes"})
    has_enough = counts["likes"] >= n
    active_players = counts.loc[has_enough].sort_values("likes").index.tolist()
    kept = df.loc[df["player"].isin(active_players)]
    return kept.sort_values("player").reset_index(drop=True)

# likes of all players that liked at least 50 levels
likes_50 = minimal_likes(likes, 50)

### Graph generation

In [None]:
def generate_bi_graph(df, nodes_1, nodes_2):
    '''Builds a two-mode graph from two columns of a DataFrame.

    df:      table in which every row describes one edge.
    nodes_1: name of the column holding the first node set.
    nodes_2: name of the column holding the second node set.

    Returns the graph, the unique values of column nodes_1 and the unique values of
    column nodes_2 (both as lists).
    '''
    first_set = list(df[nodes_1].unique())
    second_set = list(df[nodes_2].unique())

    graph = nx.Graph()
    for node_group in (first_set, second_set):
        graph.add_nodes_from(node_group)
    # one edge per row
    graph.add_edges_from(df[[nodes_1, nodes_2]].values.tolist())
    return graph, first_set, second_set


G_likes, player_nodes, level_nodes = generate_bi_graph(likes_50, "player", "id")

# Hand the known player set to bipartite.sets(): without top_nodes networkx has to
# guess the two node sets and raises AmbiguousSolution on a disconnected graph.
plr, lvl = bipartite.sets(G_likes, top_nodes=player_nodes)
print("Nr of Players (nodes):", len(plr))
print("Nr of Levels (nodes):", len(lvl))
print("Nr of likes (edges)", len(G_likes.edges))

Der two-mode Graph "G_likes" besteht aus insgesamt 1326 Spielern, welche insgesamt 210842 Likes bei 51346 verschiedenen Levels hinterlassen haben. In einem nächsten Schritt wird dieser Graph in ein one-mode Netzwerk umgewandelt, wobei die Knoten die einzelnen Spieler repräsentieren. Die Kantengewichte zwischen zwei Spielern beschreiben den Anteil der Levels, welche von beiden Knoten gemeinsam geliked wurden, errechnet mit dem Jaccard-Index: $w_{u,v} = \frac{|N(u)\cap N(v)|}{|N(u)\cup N(v)|}$

In [None]:
# Cache for the expensive projection. The file is a plain pickle, i.e. the format
# nx.write_gpickle()/nx.read_gpickle() produced before they were removed in networkx 3.0.
similar_likes_cache = "./data/pickle/G_similar_likes.gpickle"

if not os.path.isfile(similar_likes_cache):
    # Transform to one node network
    G_similar_likes = bipartite.overlap_weighted_projected_graph(G_likes, player_nodes) #overlap using jaccard coefficient of neighborhoods
    # Save as pickle (the folder is created on first use)
    os.makedirs(os.path.dirname(similar_likes_cache), exist_ok=True)
    with open(similar_likes_cache, "wb") as cache_file:
        pickle.dump(G_similar_likes, cache_file, pickle.HIGHEST_PROTOCOL)
else:
    # Load file. NOTE: only unpickle cache files that were created by this notebook.
    with open(similar_likes_cache, "rb") as cache_file:
        G_similar_likes = pickle.load(cache_file)

# edge_labels=dict([((u,v,),round(d['weight'],3)) for u,v,d in G_similar_likes.edges(data=True)])

# # Draw network
# plt.figure(figsize=(10, 10))
# ax = plt.gca()
# ax.set_title('Proportion of same liked levels')
# pos = nx.drawing.layout.random_layout(G_similar_likes)

# nx.draw(G_similar_likes, pos, with_labels=True, connectionstyle='arc3', node_color = "#3944BC", font_weight='heavy')
# nx.draw_networkx_edge_labels(G_similar_likes, 
#                              pos, edge_labels=edge_labels, 
#                              label_pos=0.5, font_size=10, 
#                              font_weight='heavy', 
#                              font_color="#3944BC", 
#                              bbox=dict(facecolor='white',edgecolor='none',alpha=0, pad=0.0))

      
# plt.draw()
# plt.show()

# size of the projected player-player network
print("Nr of Players (nodes):", G_similar_likes.number_of_nodes())
print("Nr of similarities (edges)", G_similar_likes.number_of_edges())

### Filter for players with similarities > 0.2

Da wir nach Spielern mit gemeinsamen Interessen suchen, macht es Sinn, Kanten unter einem gewissen Schwellenwert zu eliminieren, da diese Spieler eine zu kleine Gemeinsamkeit zueinander haben. Wir definieren diesen Schwellenwert hier auf 0.2, d.h. mindestens 20 Prozent der Levels, welche zwei Spieler geliked haben, müssen gleich sein; ansonsten gelten sie nicht als ähnlich und die Kante wird eliminiert. Dadurch erhalten wir den folgenden Graph:

In [None]:
# Load graph from the cache file. It is a plain pickle; nx.read_gpickle() was removed
# in networkx 3.0. NOTE: only unpickle cache files that were created by this notebook.
with open("./data/pickle/G_similar_likes.gpickle", "rb") as cache_file:
    G_similar_likes = pickle.load(cache_file)

# remove edges with edge weight < 0.2
edge_weights = nx.get_edge_attributes(G_similar_likes,'weight')
G_similar_likes.remove_edges_from([e for e, w in edge_weights.items() if w < 0.2])

# remove isolated nodes
G_similar_likes.remove_nodes_from(list(nx.isolates(G_similar_likes)))

# Create edge labels: (u, v) -> rounded weight
edge_labels = {(u, v): round(d['weight'], 2) for u, v, d in G_similar_likes.edges(data=True)}

# Draw graph
plt.figure(figsize=(20, 20))
ax = plt.gca()
ax.set_title('Proportion of same liked levels')
pos = nx.drawing.layout.spring_layout(G_similar_likes, seed=1)
nx.draw(G_similar_likes, pos, with_labels=True, connectionstyle='arc3', node_color = "#3944BC", font_weight='heavy', font_size=3, node_size=50)
nx.draw_networkx_edge_labels(G_similar_likes,
                             pos, edge_labels=edge_labels,
                             label_pos=0.55, font_size=3,
                             font_weight='heavy',
                             font_color="#3944BC",
                             bbox=dict(facecolor='white',edgecolor='none',alpha=1, pad=0.0))

plt.draw()
plt.show()

In [None]:
# Similarities of all remaining player pairs, most similar pair first
edge_weights = dict(edge_labels)
similarities = dict(sorted(edge_weights.items(), key=lambda item: item[1], reverse=True))
plt.hist(list(similarities.values()), bins=20)

# The five most similar pairs. islice() simply stops early when the graph has fewer
# than five edges (indexing the key list raised an IndexError in that case).
for pair, weight in itertools.islice(similarities.items(), 5):
    print(pair, weight)

# deciles (0.0, 0.1, ..., 1.0) of the similarity values
print("\nQuantile", "Value")
for i, j in enumerate(np.quantile(list(similarities.values()), np.array(range(0,11,1))/10)):
    print(i/10,"    ", j)

### Visualizing connected components
Wir können dieses Netzwerk nun auf verschiedene Attribute untersuchen. Um es noch übersichtlicher darzustellen, werden die einzelnen Connected Components unterschiedlich eingefärbt.

In [None]:
# Base palette, repeated until there is one colour per connected component
palette = ["tomato","skyblue","limegreen","gold","slateblue","turquoise","cornflowerblue",
           "hotpink","yellowgreen","orchid","peru","lightsalmon","thistle","darkkhaki",
           "lightsteelblue","orange","forestgreen","slategrey"]
n_components = nx.number_connected_components(G_similar_likes)
colors = palette * max(1, -(-n_components // len(palette)))  # ceiling division

# Draw graph
plt.figure(figsize=(20, 20))
ax = plt.gca()
ax.set_title('Proportion of same liked levels')
pos = nx.drawing.layout.spring_layout(G_similar_likes, seed=1)
for i, component in enumerate(nx.connected_components(G_similar_likes)):
    nx.draw(G_similar_likes, pos, with_labels=True, nodelist=component, connectionstyle='arc3', node_color=colors[i], font_size=2, node_size=50)
    # Every edge touching a node of the component lies completely inside it, so the
    # labels are collected once per component from its subgraph instead of scanning
    # all edges once per node (which also drew every label twice).
    edge_labels = {(u, v): round(d['weight'], 3)
                   for u, v, d in G_similar_likes.subgraph(component).edges(data=True)}
    nx.draw_networkx_edge_labels(G_similar_likes,
                         pos, edge_labels=edge_labels,
                         label_pos=0.55, font_size=2,
                         font_weight='heavy',
                         font_color=colors[i],
                         bbox=dict(facecolor='white',edgecolor='none',alpha=1, pad=0.0))

# savefig() does not create missing folders, so make sure ./images exists
os.makedirs("./images", exist_ok=True)
plt.savefig("./images/likes_connected_components.png", dpi=800, format="PNG")


plt.draw()
plt.show()

In [None]:
# Number of connected components, their sizes (largest first) and the share of
# nodes that belongs to the largest one
print("Nr of connected components:", nx.number_connected_components(G_similar_likes))
cpnt = sorted(map(len, nx.connected_components(G_similar_likes)), reverse=True)
print("Component sizes:", cpnt)
giant_share = cpnt[0]/sum(cpnt)
print("Giant component ratio:", round(giant_share,3))

Das neue gefilterte Netzwerk besteht aus insgesamt 45 Connected Components. Der "Giant Component" besteht aus 140 Spielern und beinhaltet knapp 50 Prozent aller Knoten im gefilterten Netzwerk.

In [None]:
# node sets of all connected components, largest component first
components = sorted(nx.connected_components(G_similar_likes), key=len, reverse=True)

In [None]:
# Subgraph of the largest connected component
Gcc = sorted(nx.connected_components(G_similar_likes), key=len, reverse=True)
G0 = G_similar_likes.subgraph(Gcc[0])

# (u, v) -> rounded edge weight, used as edge label
edge_labels = {(u, v): round(d['weight'], 2) for u, v, d in G0.edges(data=True)}

plt.figure(figsize=(14, 14))
ax = plt.gca()
ax.set_title('Proportion of same liked levels')
# fixed seed so the layout is the same on every run
pos = nx.drawing.layout.spring_layout(G0, seed=45)

nx.draw(G0, pos, with_labels=True, connectionstyle='arc3', node_color = "#3944BC", font_weight='heavy', font_size=10)

label_box = dict(facecolor='white',edgecolor='none',alpha=1, pad=0.0)
nx.draw_networkx_edge_labels(G0, pos,
                             edge_labels=edge_labels,
                             label_pos=0.55,
                             font_size=3,
                             font_weight='heavy',
                             font_color="#3944BC",
                             bbox=label_box)

plt.draw()
plt.show()

### Recommending levels based on cliques

Wenn wir nun annehmen, dass Spieler, die ein ähnliches Like-Verhalten besitzen, auch ähnliche Interessen haben, so kann es sich lohnen, nach Levels zu suchen, welche vom Ausgangsknoten noch nicht gespielt wurden, jedoch von vielen direkten Nachbarn "geliked" wurden. Diese könnten dann als potenzielle Levelempfehlungen für den jeweiligen Spieler verwendet werden.

Die Funktion get_cliques(G, n) erzeugt eine Menge mit allen möglichen Cliquen mit n Knoten.
Die Funktion get_player_cliques(clique_list, player) filtert dann nach den Cliquen eines bestimmten Knotens.
Die Funktion popular_amomg_clique_members(likes, clique_list, player) sucht anschliessend pro Clique nach Levels, welche von allen Knoten ausser dem Ausgangsknoten geliked wurden, und gibt diese cliquenübergreifend als Liste zurück.
Die Funktion not_jet_played(plays, level_list, player) filtert nun noch alle Levels heraus, welche vom Spieler laut "plays" bereits gespielt wurden, und gibt eine Liste zurück, welche potenzielle Levelempfehlungen beinhaltet.

Die Mindestanzahl an Likes, die ein Level von direkt benachbarten Knoten benötigt, um potenziell empfohlen zu werden, passt sich der Cliquengrösse an und entspricht der Cliquengrösse - 1.

In [None]:
def get_cliques(G, n):
    '''Returns all cliques with exactly n nodes of graph G.

    Every maximal clique with at least n nodes is expanded into all of its n-node
    subsets. Returns a set of n-tuples; the nodes inside a tuple are ordered by
    their string representation.
    '''
    n_cliques = set()
    for clq in nx.find_cliques(G):
        if len(clq) >= n:
            # A fixed node order gives every node subset exactly one tuple. Building
            # the tuples from an unordered set allowed the same subset, reached via
            # two overlapping maximal cliques, to appear twice in different orders.
            n_cliques.update(itertools.combinations(sorted(clq, key=str), n))
    return n_cliques

def get_player_cliques(clique_list, player):
    '''takes a list of cliques and returns cliques with a specified player'''
    return [clq for clq in clique_list if player in clq]

def popular_amomg_clique_members(likes, clique_list, player):
    '''Returns the levels that, in at least one clique of the player, were liked by
    all other clique members but not by the player.

    likes:       likes table with the columns "id" (level) and "player".
    clique_list: iterable of cliques (node tuples).
    player:      player to find levels for.

    Returns a list without duplicates (in no particular order), or None when the
    player is not part of any clique.
    '''
    player_cliques = get_player_cliques(clique_list, player)
    if not player_cliques:
        return None

    # levels the player liked already (identical for every clique -> computed once)
    already_liked = set(likes.loc[likes["player"] == player, "id"])

    recommended = set()
    # The result is returned only after ALL cliques were evaluated; a return inside
    # this loop ended the search after the first clique.
    for clq in player_cliques:
        others = [member for member in clq if member != player]
        # number of different other clique members that liked each level
        likers = likes.loc[likes["player"].isin(others)].groupby("id")["player"].nunique()
        unanimous = likers.index[likers == len(others)]
        recommended.update(level for level in unanimous if level not in already_liked)
    return list(recommended)

def not_jet_played(plays, level_list, player):
    '''Returns the levels from level_list that the player has not played yet.

    plays:      plays table with the columns "id" (level) and "player".
    level_list: candidate levels; their order is kept in the result.
    player:     player whose played levels are removed.
    '''
    # a set makes every membership test O(1) instead of scanning a list
    played_levels = set(plays.loc[plays["player"] == player, "id"])
    return [lvl for lvl in level_list if lvl not in played_levels]


# Player to create recommendations for and size of the cliques to search
player = "cowboye123"
clique_size = 7

cliques = get_cliques(G_similar_likes, clique_size)
print("Nr of " + str(clique_size) + "-node cliques: " + str(len(cliques)))
# use the `player` variable everywhere, so changing it above is enough
promising_levels = popular_amomg_clique_members(likes, cliques, player)
if promising_levels is None:
    print("No levels found for "+ str(player) +" with at least " + str(clique_size-1) +" likes. Try smaller cliques.")
else:
    recommendation = not_jet_played(plays, promising_levels, player)
    print("Recommended levels for " + player + ":\n", recommendation)

## Important Level creators

In [None]:
from matplotlib.offsetbox import TextArea, DrawingArea, OffsetImage, AnnotationBbox
from matplotlib.cbook import get_sample_data
import hvplot.networkx as hvnx
import holoviews as hv

def rank_creators(courses, course_meta, col="players"):
    '''Sums the numeric course statistics per creator and counts the created courses.

    courses:     courses table with the columns "id" and "maker".
    course_meta: per-course statistics, joined to courses on "id".
    col:         numeric column used for ranking (descending).

    Returns one row per maker with the summed numeric columns and a "levels" column
    holding the number of courses of that maker, sorted by col.
    '''
    df = pd.merge(courses, course_meta, how="left", on="id")
    levels_per_maker = courses.groupby("maker")["id"].count().rename("levels").reset_index()
    # numeric_only=True: since pandas 2.0 sum() no longer drops non-numeric columns
    # by itself; text columns would be concatenated (or raise) instead of being left out.
    totals = df.groupby("maker").sum(numeric_only=True).sort_values(by=[col],ascending=False).reset_index()
    return pd.merge(totals, levels_per_maker, how="left", on="maker")

def download_profile_pics(df, namelist):
    """Downloads the profile image of every player in namelist.

    df:       players table with the columns "id" and "image" (image URL).
    namelist: player ids; each image is stored as ./profiles/<id>.png.
    """
    frame = df.copy()
    for playername in namelist:
        # image URL of the first row that matches the player id
        matches = frame.loc[frame["id"] == playername, "image"].reset_index(drop=True)
        url = matches[0]
        target = "./profiles/" + str(playername) + ".png"
        urllib.request.urlretrieve(str(url), target)

def top_creators_bar(df, column, amount, players):
    '''Creates a bar plot of the top creators with their profile image above each bar.

    df:      creator table with a "maker" column and the column to rank by.
    column:  name of the column that is used for ranking and as bar height.
    amount:  number of creators to show.
    players: players table, passed on to download_profile_pics() when an image
             is missing in ./profiles.
    '''
    # prepare dataframe: sort by the ranking column and keep the first `amount` rows
    df = df.copy()
    df = df.sort_values(by=column, ascending=False).reset_index()
    df=df.head(amount)
    # Check if all creator images exist
    files_exist = []
    for file in df["maker"]:
        files_exist.append(os.path.isfile("./profiles/" + file + ".png"))
    # if at least one image is missing, the images of all shown creators are downloaded
    if not all(files_exist):
        print("Download images")
        download_profile_pics(players, df["maker"])
    # Create plot
    fig, ax = plt.subplots(figsize=(10, 6))
    # one (maker, value) pair per bar; used as anchor point of the image
    image_coords = np.array(df[["maker",column]])
    # NOTE(review): `images` is not used anywhere below
    images = np.array(df["maker"])
    ax.bar(x=df["maker"], height=df[column])
    for i in image_coords:
        xy = i
        arr_img = plt.imread('./profiles/'+str(i[0])+'.png', format='png')
        imagebox = OffsetImage(arr_img, zoom=0.5)
        ax.plot(xy[0], xy[1])
        imagebox.image.axes = ax
        # image box is placed 30 points above the anchor; the arrow is fully transparent
        ab = AnnotationBbox(imagebox, xy,
                            xybox=(0, 30),
                            xycoords='data',
                            boxcoords="offset points",
                            pad=0.0,
                            arrowprops=dict(
                                alpha=0,
                                arrowstyle="-",
                                connectionstyle="angle,angleA=0,angleB=90,rad=0"),
                            bboxprops =dict(edgecolor="#FFFFFF"))
        ax.add_artist(ab)
    # 30 percent head room above the highest bar
    ax.set_ylim(0, max(df[column])*1.3)
    plt.title("Creators with most " + column)
    plt.xlabel("top creators")
    plt.ylabel(str(column) +  " [n]")
    plt.xticks(rotation=45)
    plt.show()

In [None]:


def filter_by_creators(courses, plays, creators):
    '''Restricts courses and plays to the given creators.

    Returns two frames: the courses whose "maker" is in creators, and the plays
    whose level "id" belongs to one of these courses.
    '''
    made_by_creators = courses["maker"].isin(creators)
    levels = courses.loc[made_by_creators]
    on_these_levels = plays["id"].isin(levels["id"])
    return levels, plays.loc[on_these_levels]

def creator_ego_network(levels_df, players_df):
    '''Visualizes the ego network of a creator with all its levels and their players.

    levels_df:  courses of the creator(s), columns "id" and "maker".
    players_df: plays on these courses, columns "id" (level) and "player".

    The nodes are laid out in three columns (creators, levels, players) and coloured
    by their type. Returns the plot created by hvnx.draw().
    '''
    # nodelists
    level_nodes = levels_df["id"].unique()
    creator_nodes = levels_df["maker"].unique()
    player_nodes = players_df["player"].unique()
    # edgelists
    level_by_creator = levels_df[["id","maker"]].values.tolist()
    player_by_level = players_df[["id","player"]].values.tolist()
    BG = nx.Graph()
    # add nodes here
    BG.add_nodes_from(creator_nodes, node_type="Creator", count=1)
    BG.add_nodes_from(level_nodes, node_type="Levels",count=1)
    # A creator that also appears as a player keeps its "Creator" type; adding it
    # again would overwrite the node_type and move it into the player column.
    BG.add_nodes_from((p for p in player_nodes if p not in BG), node_type="Players",count=1)
    # add edges here
    BG.add_edges_from(level_by_creator)
    BG.add_edges_from(player_by_level)
    nodes = BG.nodes()
    # one list per part, in graph order (lists keep the layout reproducible)
    nodes_0 = [n for n in nodes if BG.nodes[n]['node_type']=="Creator"]
    nodes_1 = [n for n in nodes if BG.nodes[n]['node_type']=="Levels"]
    nodes_2 = [n for n in nodes if BG.nodes[n]['node_type']=="Players"]
    # Ratio between the node lists for spacing; max() avoids a division by zero
    # when there is only a single level.
    spacing_1 = len(nodes_2)/max(len(nodes_1)-1, 1)
    # set the location of the nodes for each part
    pos = dict()
    pos.update( (n, (1, i)) for i, n in enumerate(nodes_0) ) # creators at x=1
    pos.update( (n, (2, i*spacing_1)) for i, n in enumerate(nodes_1) ) # levels at x=2
    pos.update( (n, (3, i)) for i, n in enumerate(nodes_2) ) # players at x=3
    # one colour per node, looked up from the node type in graph order, so colours
    # cannot get out of step with the nodes
    type_colors = {"Creator": "#1E90FF", "Levels": "#00FA9A", "Players": "#FA8072"}
    node_color = [type_colors[BG.nodes[n]['node_type']] for n in nodes]
    p = hvnx.draw(BG, pos, node_color=node_color, label="Multipartite ego network of creator: {}".format(creator_nodes[0]),
              node_size=40, edge_width=0.5, width=800, height=700, linewidths=0.2)
    return p


def weighted_transformed_creator(weighted_levels_df, seed=None):
    '''Transforms the creator network to a weighted creator-level graph and draws it twice.

    Parameters
    ----------
    weighted_levels_df : DataFrame with the columns "id" (level), "maker"
        (creator) and "players" (weight of the level).
    seed : optional seed passed to nx.spring_layout so the second plot can be
        reproduced; None keeps the previous unseeded behaviour.

    Returns
    -------
    The composition (a + b) of two hvnx.draw plots: a two-column layout with
    creators at x=0 and levels at x=1, and a spring layout of the same graph.
    '''
    palette = {"Creator": "#1E90FF", "Levels": "#00FA9A"}

    # A level without a player count (NaN, e.g. after a left merge) weighs 0.
    level_players = weighted_levels_df["players"].fillna(0)

    # Weight of each creator = sum of the weights of that creator's own levels.
    creator_weights = level_players.groupby(weighted_levels_df["maker"], sort=False).sum()
    # Weight of each level, keyed by level id (first row wins on duplicate ids).
    level_weights = level_players.groupby(weighted_levels_df["id"], sort=False).first()

    # edgelists
    level_by_creator = weighted_levels_df[["id", "maker"]].values.tolist()

    BG = nx.Graph()
    BG.add_nodes_from(
        (maker, {"node_type": "Creator", "players": weight})
        for maker, weight in creator_weights.items()
    )
    BG.add_nodes_from(
        (level, {"node_type": "Levels", "players": weight})
        for level, weight in level_weights.items()
    )
    BG.add_edges_from(level_by_creator)

    # Split the nodes by part, keeping graph (insertion) order.
    creators = [n for n, t in BG.nodes(data="node_type") if t == "Creator"]
    levels = [n for n, t in BG.nodes(data="node_type") if t == "Levels"]

    # set the location of the nodes for each part
    pos = dict()
    pos.update((n, (0, i)) for i, n in enumerate(creators))  # creators at x=0
    # levels at x=1, spread from y=1 downwards in steps of 2/len(levels)
    pos.update((n, (1, (2/len(levels))*(len(levels)/2-i))) for i, n in enumerate(levels))

    # One colour per node, in graph order, looked up from the node's own type.
    node_color = [palette[t] for _, t in BG.nodes(data="node_type")]

    a = hvnx.draw(BG, pos, node_color=node_color, label="Weighted transformed network: bipartite layout   ->", 
              font_color="#000000", font_weight="normal", text_font_size="small", node_size="players", edge_width=0.5, width=400, height=700, 
              linewidths=0.2, labels="players")
    pos = nx.spring_layout(BG, seed=seed)
    b = hvnx.draw(BG, pos, node_color=node_color, label="     Spring layout", 
              font_color="#000000", font_weight="normal", text_font_size="small", node_size="players", edge_width=0.5, width=500, height=700, 
              linewidths=0.2, labels="players")
    return a + b


In [None]:
# Restrict the course and play tables to the levels made by the creator "Katzzzz".
levels_df, players_df = filter_by_creators(courses, plays, ["Katzzzz"])
# Draw that creator's ego network; the returned plot is the cell's output.
creator_ego_network(levels_df, players_df)

In [None]:
# Count the non-null entries of players_df per level id; the resulting
# "player" count is renamed to "players" and the "catch" count is discarded.
playercount = (
    players_df
    .groupby("id")
    .count()
    .reset_index()
    .rename(columns={"player": "players"})
    .drop(columns="catch")
)
# Attach the per-level count to the creator's levels (levels without a match keep NaN).
weighted_levels_df = levels_df.merge(playercount, how="left", on="id")
weighted_transformed_creator(weighted_levels_df)

In [None]:
# Build the creator table consumed by the top_creators_bar calls and overall_rank.
# rank_creators is defined outside this section; presumably "players" selects
# the metric the table is ranked by — TODO confirm against its definition.
df_top_creators = rank_creators(courses, course_meta, "players")

In [None]:
# Top 10 creators by "players". top_creators_bar is defined outside this section
# (presumably a bar chart — confirm); the global players table is the 4th argument.
top_creators_bar(df_top_creators, "players", 10, players)

In [None]:
# Top 10 creators by "levels". top_creators_bar is defined outside this section
# (presumably a bar chart — confirm); the global players table is the 4th argument.
top_creators_bar(df_top_creators, "levels", 10, players)

In [None]:
# Top 10 creators by "stars". top_creators_bar is defined outside this section
# (presumably a bar chart — confirm); the global players table is the 4th argument.
top_creators_bar(df_top_creators, "stars", 10, players)

In [None]:
def overall_rank(df):
    '''Ranks creators overall by combining their stars, players and levels ranks.

    Each of the three metrics is ranked in descending order (highest value
    gets rank 1), the three ranks are multiplied, and that product is ranked
    again in ascending order to give "overall_rank". The input frame is left
    untouched; the result is sorted by "overall_rank", re-indexed from 0 and
    returned without the "tweets", "clears", "attempts" and "clearRate" columns.
    '''
    dropped_columns = ["tweets", "clears", "attempts", "clearRate"]

    # assign() works on a copy, so the caller's frame is never modified
    ranked = df.assign(
        stars_rank=df["stars"].rank(ascending=False),
        players_rank=df["players"].rank(ascending=False),
        levels_rank=df["levels"].rank(ascending=False),
    )

    # a small product means the creator is near the top in all three metrics
    rank_product = ranked["players_rank"] * ranked["stars_rank"] * ranked["levels_rank"]
    ranked["overall_rank"] = rank_product.rank()

    ordered = ranked.sort_values(by="overall_rank").reset_index(drop=True)
    return ordered.drop(columns=dropped_columns)
    
# Compute the combined ranking for all creators in df_top_creators ...
final_rank = overall_rank(df_top_creators)
# ... and display the ten rows with the best (lowest) overall_rank.
final_rank.head(10)

In [None]:

# Overlay a creator's profile picture on the current matplotlib figure,
# centred on the position stored for that creator in `pos`.
# NOTE(review): `pos` is not assigned at notebook level anywhere in this part
# of the notebook (only inside functions and in commented-out code), so this
# cell may raise NameError on a fresh kernel — confirm where it is defined.
img = plt.imread("./profiles/Monkeydelphin173.png")
N = 1  # NOTE(review): not used in this cell — confirm it is needed elsewhere

ax=plt.gca()
fig=plt.gcf()

trans = ax.transData.transform  # data coordinates -> display (pixel) coordinates
trans2 = fig.transFigure.inverted().transform  # display coordinates -> figure-fraction coordinates
imsize = 0.1 # image width and height as a fraction of the figure

(x,y) = pos["Monkeydelphin173"]
xx,yy = trans((x,y)) # display coordinates
xa,ya = trans2((xx,yy)) # figure-fraction coordinates
a = plt.axes([xa-imsize/2.0,ya-imsize/2.0, imsize, imsize ])  # new axes centred on the node position
a.imshow(img)
a.set_aspect('equal')
a.axis('off')
plt.show()

In [None]:
# plt.figure(figsize=(20, 20))


# level_nodes = levels_df["id"].unique()
# creator_nodes = levels_df["maker"].unique()
# player_nodes = players_df["player"].unique()

# level_by_creator = levels_df[["id","maker"]].values.tolist()
# player_by_level = players_df[["id","player"]].values.tolist()


# BG = nx.Graph()

# # add nodes here
# BG.add_nodes_from(level_nodes, bipartite=1)
# BG.add_nodes_from(creator_nodes, bipartite=0)
# BG.add_nodes_from(player_nodes, bipartite=2)

# # add edges here
# BG.add_edges_from(level_by_creator)
# BG.add_edges_from(player_by_level)


# nodes = BG.nodes()
# # for each of the parts create a set 
# nodes_0  = set([n for n in nodes if  BG.nodes[n]['bipartite']==0])
# nodes_1  = set([n for n in nodes if  BG.nodes[n]['bipartite']==1])
# nodes_2  = set([n for n in nodes if  BG.nodes[n]['bipartite']==2])

# # Calculate ratio between nodelists for spacing
# spacing_1 = len(nodes_2)/(len(nodes_1)-1)

# # set the location of the nodes for each set
# pos = dict()
# pos.update( (n, (1, i)) for i, n in enumerate(nodes_0) ) # put creator nodes at x=1
# pos.update( (n, (2, i*spacing_1)) for i, n in enumerate(nodes_1) ) # put level nodes at x=2
# pos.update( (n, (3, i)) for i, n in enumerate(nodes_2) ) # put player nodes at x=3

# nx.draw_networkx_nodes(BG, pos, nodelist=creator_nodes, node_color="#FF1E2D", node_size=10)  
# nx.draw_networkx_nodes(BG, pos, nodelist=level_nodes, node_color="#00FF2D", node_size=10)  
# nx.draw_networkx_nodes(BG, pos, nodelist=player_nodes, node_color="#3944BC", node_size=10)          


# nx.draw_networkx_edges(BG, pos)

# plt.show()