In [47]:
from itertools import combinations

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import sklearn

In [48]:
# Directory (relative to the notebook) that holds the pre-filtered CSV exports.
PATH = "../datasets/filtered-data/"

# Load the actor/actress table and the movie table in one unpacking step.
df_actors, df_movie = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ("actors_actresses.csv", "filtered_movies.csv")
)

In [49]:
def find_movie_of_genre_year(genre, years):
    """Return the ``tconst`` column for movies matching a genre and a set of years.

    Args:
        genre: Pattern looked up inside the ``genres`` column with
            ``str.contains`` (so it is a substring/regex match, not equality).
        years: Collection of ``startYear`` values to accept.

    Returns:
        A pandas Series of ``tconst`` values taken from the module-level
        ``df_movie`` frame.
    """
    genre_mask = df_movie['genres'].str.contains(genre)
    year_mask = df_movie["startYear"].isin(years)
    return df_movie.loc[genre_mask & year_mask, "tconst"]

In [50]:
class GenreGraph:
    """Actor co-appearance graph for one genre over a span of years.

    Nodes are ``nconst`` values from ``df_actors``. Two actors are joined by an
    edge when they appear in the same movie, and the edge's ``weight`` counts
    how many of the selected movies they share.
    """

    def __init__(self, genre, start_year, end_year):
        """Select the movies and build the graph immediately.

        Args:
            genre: Genre passed to ``find_movie_of_genre_year``.
            start_year: First year covered (inclusive).
            end_year: Year at which the span stops (exclusive).
        """
        self.genre = genre
        # Explicit list of the covered years; end_year itself is not included.
        self.years = list(range(start_year, end_year))
        self.graph = self.create_graph()

    def create_graph(self):
        """Build the weighted co-appearance graph for ``self.genre`` / ``self.years``.

        Returns:
            An ``nx.Graph`` whose edges carry an integer ``weight`` attribute.
            Actors that never share a selected movie with anyone are not added.
        """
        movie_ids = find_movie_of_genre_year(self.genre, self.years)

        # Gather every selected movie's cast in a single pass over df_actors
        # instead of re-filtering the whole frame once per movie.
        in_scope = df_actors[df_actors['tconst'].isin(movie_ids)]
        cast_by_movie = {
            movie_id: list(dict.fromkeys(cast))  # drop repeated actors, keep order
            for movie_id, cast in in_scope.groupby('tconst')['nconst']
        }

        G = nx.Graph()
        for movie_id in movie_ids:
            # De-duplicated casts mean an actor credited twice in one movie
            # can no longer produce a self-loop edge.
            for actor1, actor2 in combinations(cast_by_movie.get(movie_id, ()), 2):
                if G.has_edge(actor1, actor2):
                    G[actor1][actor2]['weight'] += 1
                else:
                    G.add_edge(actor1, actor2, weight=1)
        return G

    def print(self):
        """Draw the graph with node labels and edge weights, then show the figure."""
        pos = nx.spring_layout(self.graph, k=0.5)
        nx.draw(self.graph, pos, with_labels=True, node_size=500, font_size=12)
        # Annotate every edge with its shared-movie count.
        edge_labels = nx.get_edge_attributes(self.graph, 'weight')
        nx.draw_networkx_edge_labels(self.graph, pos, edge_labels=edge_labels)
        plt.title(f'{self.genre} in {self.years}')
        plt.show()

    

In [51]:
def create_graphs(start_year=1915, year_range=3, end_year=2020, genre_list=["Action","Adventure","Sci-Fi","Drama","Fantasy","Romance","Horror","Comedy"]):
    """Build one ``GenreGraph`` per genre and per consecutive window of years.

    Args:
        start_year: First year of the first window.
        year_range: Width of each window, in years.
        end_year: Exclusive upper bound; no window extends to or past it.
        genre_list: Genres to build graphs for. The default list is only
            iterated, never mutated.

    Returns:
        A tuple ``(X, y)`` of equal-length lists: the ``GenreGraph`` objects and
        the genre label of each one.
    """
    X = []
    y = []
    for genre in genre_list:
        for year in range(start_year, end_year, year_range):
            print(f'genre: {genre}')
            print(f'year: {year}')
            # Clamp the final window so it never covers years >= end_year when
            # the total span is not an exact multiple of year_range.
            window_end = min(year + year_range, end_year)
            X.append(GenreGraph(genre, year, window_end))
            y.append(genre)
    return X,y

In [76]:
def extract_features(graph):
    """Summarise a graph as a fixed-length numeric feature vector.

    Args:
        graph: A networkx graph.

    Returns:
        A 1-D numpy array of five values, in this order: mean degree, average
        clustering coefficient, mean degree centrality, mean closeness
        centrality, mean PageRank. A graph without any nodes yields all zeros.
    """
    # Averages over zero nodes are undefined (average_clustering would divide
    # by zero), so give node-less graphs a well-defined all-zero vector.
    if graph.number_of_nodes() == 0:
        return np.zeros(5)

    avg_degree = np.mean([d for n, d in graph.degree()])
    avg_clustering = nx.average_clustering(graph)
    degree_centrality = np.mean(list(nx.degree_centrality(graph).values()))
    closeness_centrality = np.mean(list(nx.closeness_centrality(graph).values()))
    pagerank = np.mean(list(nx.pagerank(graph).values()))

    # More features can be added here; keep the order stable for the classifier.
    features = np.array([avg_degree, avg_clustering, degree_centrality, closeness_centrality, pagerank])
    return features


In [53]:
# Build every genre/year-window graph with the default settings; `y` receives
# the genre label matching each entry of `graph_list`.
graph_list, y = create_graphs()

genre: Action
year: 1915
genre: Action
year: 1918
genre: Action
year: 1921
genre: Action
year: 1924
genre: Action
year: 1927
genre: Action
year: 1930
genre: Action
year: 1933
genre: Action
year: 1936
genre: Action
year: 1939
genre: Action
year: 1942
genre: Action
year: 1945
genre: Action
year: 1948
genre: Action
year: 1951
genre: Action
year: 1954
genre: Action
year: 1957
genre: Action
year: 1960
genre: Action
year: 1963
genre: Action
year: 1966
genre: Action
year: 1969
genre: Action
year: 1972
genre: Action
year: 1975
genre: Action
year: 1978
genre: Action
year: 1981
genre: Action
year: 1984
genre: Action
year: 1987
genre: Action
year: 1990
genre: Action
year: 1993
genre: Action
year: 1996
genre: Action
year: 1999
genre: Action
year: 2002
genre: Action
year: 2005
genre: Action
year: 2008
genre: Action
year: 2011
genre: Action
year: 2014
genre: Action
year: 2017
genre: Adventure
year: 1915
genre: Adventure
year: 1918
genre: Adventure
year: 1921
genre: Adventure
year: 1924
genre: Advent

In [60]:
# Display the genre labels returned by create_graphs().
y

['Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Sci-Fi',
 'Sci-Fi',
 'Sci-Fi',
 'Sci-Fi',
 'Sci-Fi',
 'Sci-Fi',
 'Sci-Fi',
 'Sci-Fi',
 'Sci-Fi',
 'Sci-Fi',
 'Sci-Fi',
 'Sc

In [63]:
def make_data(graph_list):
    """Keep the graphs that have at least one edge, together with their labels.

    Each label is read from the item's own ``genre`` attribute rather than from
    a module-level label list, so the result does not depend on which cells ran
    before and the function can be re-run safely.

    Args:
        graph_list: Iterable of objects exposing ``.graph`` (a networkx graph)
            and ``.genre``.

    Returns:
        A tuple ``(X, new_y)``: the retained networkx graphs and the genre of
        each, in the original order.
    """
    X = []
    new_y = []
    for g in graph_list:
        # A graph with zero edges carries no co-appearance information.
        if g.graph.number_of_edges() > 0:
            X.append(g.graph)
            new_y.append(g.genre)
    return X, new_y

In [None]:
# Draw the first graph encountered for each genre. The label is read from the
# graph object itself instead of zipping against the global `y`: a later cell
# rebinds `y` to a shorter, filtered list, which would silently mispair graphs
# and labels if this cell were run afterwards.
s = set()
for graph in graph_list:
    genre = graph.genre
    if genre not in s:
        s.add(genre)
        graph.print()

In [64]:
# Keep only the non-empty graphs. Note: this rebinds `y` to the filtered label
# list, so it no longer lines up index-for-index with `graph_list`.
X, y = make_data(graph_list)

In [71]:
# Number of graphs that survived the empty-graph filter.
len(X)

278

In [77]:
# One feature vector per retained graph, in the same order as X.
extracted_X = list(map(extract_features, X))

In [80]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(extracted_X, y, test_size=0.2, random_state=42)

In [81]:
# Fit a 100-tree random forest on the training features (fit returns the estimator itself).
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split: fraction of test graphs whose genre is predicted correctly.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.46
