In [3]:
import math
from collections import deque

class Artist:
    """Running-average feature profile of one artist.

    The constructor receives one row laid out as (per the original column
    list): artist_name, artist_id, danceability, energy, valence, tempo,
    loudness, mode, key, acousticness, instrumentalness, liveness,
    speechiness, duration_ms, popularity, count.

    Attributes:
        artist_name: first element of the row.
        artist_id: second element of the row.
        attributes: list holding every remaining element, addressed by fixed
            position; the last element is the number of songs averaged so far.
    """

    def __init__(self, attributes):
        """Split a row into name, id and the positional feature list.

        Args:
            attributes: indexable, sliceable row; see the class docstring for
                the expected column order.
        """
        self.artist_name = attributes[0]
        self.artist_id = attributes[1]
        # Always store a fresh list: add_song assigns by index, which would
        # fail on a tuple row and would write through to the caller's data
        # for array-like rows.
        self.attributes = list(attributes[2:])

    def add_song(self, song_attribute):
        """Fold one song into the running averages and bump the song count.

        Each averaged slot becomes (old * count + new) / (count + 1).

        Args:
            song_attribute: sequence of per-song values; positions
                0 .. len(song_attribute) - 2 are averaged into the same
                positions of ``self.attributes``. The last element is not
                read. (Presumably it shares the layout of ``self.attributes``
                with a trailing count slot -- confirm against the caller.)
        """
        song_count = self.attributes[-1]
        for i in range(len(song_attribute) - 1):
            self.attributes[i] = ((self.attributes[i] * song_count
                                   + song_attribute[i]) / (song_count + 1))
        self.attributes[-1] += 1

    def similarity_measure(self, other_artist):
        """Return the cosine similarity between this artist and another.

        Every entry of ``self.attributes`` is used as one dimension.
        NOTE(review): that includes the trailing song count, which then
        contributes to the similarity like any audio feature -- confirm this
        is intended.

        Args:
            other_artist: object exposing an ``attributes`` sequence.

        Returns:
            dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero
            magnitude (the ratio is undefined there; previously this raised
            ZeroDivisionError).
        """
        own = self.attributes
        other = other_artist.attributes

        dot = sum(x * y for x, y in zip(own, other))
        own_norm = math.sqrt(sum(x * x for x in own))
        other_norm = math.sqrt(sum(x * x for x in other))

        denominator = own_norm * other_norm
        if denominator == 0:
            return 0.0
        return dot / denominator

In [6]:
class ArtistArray:
    """Ordered collection of artist objects with lookup by name or id.

    Attributes:
        artists: list of stored artists, in insertion order.
        num: number of stored artists (kept in step with ``len(artists)``).
    """

    def __init__(self):
        """Start with an empty collection."""
        self.artists = []
        self.num = 0

    def add_artist(self, artist):
        """Append ``artist`` unless an equal object is already stored.

        Membership uses ``in``, i.e. the artist's own equality; objects that
        do not define ``__eq__`` are therefore compared by identity.
        """
        if artist not in self.artists:
            self.artists.append(artist)
            self.num += 1

    def get_artist_by_name(self, name):
        """Return the first stored artist whose ``artist_name`` equals ``name``, else None."""
        return next(
            (artist for artist in self.artists if artist.artist_name == name),
            None,
        )

    def get_artist_by_id(self, id):
        """Return the first stored artist whose ``artist_id`` equals ``id``, else None."""
        # ``id`` shadows the builtin, but it is the public parameter name and
        # is kept so keyword callers continue to work.
        return next(
            (artist for artist in self.artists if artist.artist_id == id),
            None,
        )

    def compare_two_artists(self, artist_id_a, artist_id_b):
        """Return the similarity between the two artists with the given ids.

        Both arguments are artist ids. (The first lookup used to go through
        the name index and the score was computed but never returned.)

        Args:
            artist_id_a: id of the artist whose ``similarity_measure`` is called.
            artist_id_b: id of the artist passed to that call.

        Returns:
            Whatever ``similarity_measure`` returns for the pair.

        Raises:
            ValueError: if either id is not in the collection.
        """
        a = self.get_artist_by_id(artist_id_a)
        b = self.get_artist_by_id(artist_id_b)
        missing = [
            requested
            for requested, found in ((artist_id_a, a), (artist_id_b, b))
            if found is None
        ]
        if missing:
            raise ValueError(f"unknown artist id(s): {missing}")
        return a.similarity_measure(b)

In [36]:
# open and read in the file
import pandas as pd

# Path of the influence CSV, relative to the notebook's working directory.
file_path = 'influence_data.csv'

# Load the whole CSV into a DataFrame.
df = pd.read_csv(file_path)

# Convert to a list with one dict per CSV row (column name -> value).
# The graph-building loop further down reads the keys 'influencer_id',
# 'influencer_active_start', 'follower_id' and 'follower_active_start'.
influence_data = df.to_dict('records')

# create an oriented graph from csv file
class Node:
    """One vertex of the directed influence graph.

    Attributes:
        id: identifier supplied by the creator of the node.
        active_start: value supplied alongside the id (the loader passes the
            CSV's ``*_active_start`` column here).
        next_nodes: set of nodes this node points to.
    """

    def __init__(self, id, active_start):
        """Create a node with no outgoing edges."""
        self.next_nodes = set()
        self.active_start = active_start
        self.id = id

    def add_next_node(self, node):
        """Record an outgoing edge to ``node``; adding it again has no effect."""
        self.next_nodes.update((node,))

class Graph:
    """Directed graph stored as a mapping from node id to Node.

    Attributes:
        nodes: dict of ``id -> Node``; each Node's ``next_nodes`` holds its
            outgoing neighbours.
    """

    def __init__(self):
        """Start with no nodes."""
        self.nodes = dict()

    def get_or_create_node(self, id, active_start):
        """Return the node registered under ``id``, creating it on first use.

        ``active_start`` is only used when the node is created; an existing
        node is returned unchanged.
        """
        try:
            return self.nodes[id]
        except KeyError:
            created = Node(id, active_start)
            self.nodes[id] = created
            return created

    def add_edge(self, from_id, from_active_start, to_id, to_active_start):
        """Add the directed edge ``from_id -> to_id``, creating either endpoint if needed."""
        source = self.get_or_create_node(from_id, from_active_start)
        target = self.get_or_create_node(to_id, to_active_start)
        source.add_next_node(target)


# Build the directed influence graph: every CSV row contributes one
# influencer -> follower edge.
graph = Graph()
for line in influence_data:
    graph.add_edge(
        from_id=line['influencer_id'],
        from_active_start=line['influencer_active_start'],
        to_id=line['follower_id'],
        to_active_start=line['follower_active_start'],
    )


class ProblemSolver:
    """Answers influence questions over a directed influencer -> follower graph.

    Attributes:
        graph: object whose ``nodes`` attribute maps node id -> node; each
            node exposes ``id``, ``active_start`` and ``next_nodes``.
        relationship: dict of ``influencer id -> set of follower nodes``,
            filled by ``build_relationship``.
        data: iterable of row dicts with at least the keys
            ``'influencer_id'`` and ``'follower_id'``.
    """

    def __init__(self, graph, influence_data):
        self.graph = graph
        self.relationship = {}
        self.data = influence_data

    def build_relationship(self, depth):
        """Fill ``self.relationship`` with each node's multi-layer followers.

        For every node, collects all nodes reachable from it along at most
        ``depth - 1`` edges. So ``depth=2`` yields direct followers only and
        ``depth<=1`` yields nothing; this matches the original recursion,
        whose callers' recorded results depend on it. A node that lies on a
        short enough cycle ends up in its own follower set.

        The walk proceeds layer by layer and never revisits a node, so the
        cost per start node is bounded by the edges within reach instead of
        by the number of distinct paths.

        Repeated calls accumulate into the existing sets.

        Args:
            depth: layer budget as described above.
        """
        for init_id, init_node in self.graph.nodes.items():
            reached = set()
            frontier = {init_node}
            remaining = depth - 1
            while remaining > 0 and frontier:
                frontier = {
                    follower
                    for node in frontier
                    for follower in node.next_nodes
                    if follower not in reached
                }
                reached |= frontier
                remaining -= 1
            # Only nodes that actually influence someone get an entry.
            if reached:
                self.relationship.setdefault(init_id, set()).update(reached)

    def find_most_influential(self):
        """Return the id whose follower set in ``self.relationship`` is largest.

        Ties go to the id encountered first; returns None when the
        relationship map is empty (e.g. ``build_relationship`` has not run).
        """
        most_influence = None
        max_influence = 0
        for influencer, followers in self.relationship.items():
            if len(followers) > max_influence:
                max_influence = len(followers)
                most_influence = influencer
        return most_influence

    def find_most_influential_years(self, follower_start=2000, top_n=3):
        """Rank influencer start years by how many matching followers they have.

        Counts every direct edge whose follower has
        ``active_start == follower_start`` and groups the counts by the
        influencer's ``active_start``.

        Args:
            follower_start: ``active_start`` value a follower must equal to
                be counted (default 2000, the original hard-coded value).
            top_n: how many years to return (default 3).

        Returns:
            Tuple of influencer start years, highest count first. It holds
            fewer than ``top_n`` entries when fewer distinct years exist
            (previously this raised IndexError).
        """
        influence_years = {}
        for node in self.graph.nodes.values():
            for follower in node.next_nodes:
                if follower.active_start == follower_start:
                    year = node.active_start
                    influence_years[year] = influence_years.get(year, 0) + 1

        # Ascending stable sort read from the back keeps the original
        # tie order (the year inserted later wins a tie).
        ranked = sorted(influence_years.items(), key=lambda item: item[1])
        return tuple(year for year, _ in ranked[::-1][:top_n])

    def find_non_influencers(self):
        """Return how many ids appear as a follower but never as an influencer in ``self.data``."""
        influencer_set = set()
        follower_set = set()
        # Single pass so ``self.data`` may be any one-shot iterable.
        for line in self.data:
            influencer_set.add(line['influencer_id'])
            follower_set.add(line['follower_id'])
        return len(follower_set - influencer_set)

    def find_shortest_influence_chain(self, start_data, end_data):
        """Breadth-first search for the shortest id chain from start to end.

        Args:
            start_data: id of the starting node.
            end_data: id of the target node.

        Returns:
            List of ids from ``start_data`` to ``end_data`` inclusive
            (``[start_data]`` when they are equal), or None when no chain
            exists or ``start_data`` is not in the graph.
        """
        if start_data == end_data:
            return [start_data]
        if start_data not in self.graph.nodes:
            return None

        # ``parents`` doubles as the visited set; marking a node when it is
        # first discovered keeps duplicates out of the queue.
        parents = {start_data: None}
        queue = deque([start_data])

        while queue:
            current = queue.popleft()
            for next_node in self.graph.nodes[current].next_nodes:
                next_id = next_node.id
                if next_id in parents:
                    continue
                parents[next_id] = current
                if next_id == end_data:
                    # Walk the parent links back to the start.
                    path = [next_id]
                    while path[-1] != start_data:
                        path.append(parents[path[-1]])
                    path.reverse()
                    return path
                queue.append(next_id)

        return None  # No path found

# Endpoints of the shortest-chain query. Named once so the call and the
# printed label below cannot drift apart.
CHAIN_START_ID = 1163
CHAIN_END_ID = 601430

problem_solver = ProblemSolver(graph, influence_data)
problem_solver.build_relationship(4)
most_influential = problem_solver.find_most_influential()#James Brown influenced others the most
influential_years = problem_solver.find_most_influential_years()#year 1990, 1980, 1970 influenced 2000 artists most
non_influencers_count=problem_solver.find_non_influencers()#1829 non-influencers
# Pass any two artist ids to get the shortest influence chain between them.
shortest_chain = problem_solver.find_shortest_influence_chain(CHAIN_START_ID, CHAIN_END_ID)
print('Most influential artist:', most_influential)
print('Most influential years:', influential_years)
print(f"Number of non-influencers:{non_influencers_count}")
print(f"Shortest influence chain from {CHAIN_START_ID} to {CHAIN_END_ID}: {shortest_chain}")

Most influential artist: 128099
Most influential years: (1990, 1980, 1970)
Number of non-influencers:1829
Shortest influence chain from aa to bb: [1163, 542549, 757665, 601430]
