## Imports

In [2]:
import os
import networkx as nx
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
PATH = "../datasets/filtered-data/"

movies = pd.read_csv(f"{PATH}filtered_movies.csv")
actors = pd.read_csv(f"{PATH}actors_actresses.csv")
crew = pd.read_csv(f"{PATH}filtered_crew.csv")
ratings = pd.read_csv(f"{PATH}filtered_ratings.csv")

## Netttoyer les donnees

In [4]:
print(actors.shape)

actor_film_counts = actors['nconst'].value_counts()

actors = actors[actors['nconst'].isin(actor_film_counts[actor_film_counts > 10].index)]
print(actors.shape)

(416137, 4)
(133834, 4)


In [5]:
movies = movies.dropna(subset=['genres'])
movies['genres'] = movies['genres'].apply(lambda x: x.split(','))

selected_genres = ['Drama', 'Comedy', 'Action', 'Romance']
min_num_movies = min([len(movies[movies['genres'].apply(lambda x: genre in x)]) for genre in selected_genres])

actors = actors.merge(movies[['tconst', 'primaryTitle', 'genres']], on='tconst')
print(actors.shape)

(133834, 6)


In [6]:
movie_to_actors = defaultdict(list)
for _, row in actors.iterrows():
    movie_to_actors[row['tconst']].append(row['nconst'])

### Extract features

In [7]:
def extract_features(graph):
    features = {}

    num_nodes = graph.number_of_nodes()
    num_edges = graph.number_of_edges()

    if num_nodes > 0:
        features['avg_degree'] = np.mean([d for n, d in graph.degree()])
    else:
        features['avg_degree'] = 0

    if num_nodes > 1:
        features['avg_clustering'] = nx.average_clustering(graph)
        features['degree_centrality'] = np.mean(list(nx.degree_centrality(graph).values()))
    else:
        features['avg_clustering'] = 0
        features['degree_centrality'] = 0

    return features


## Creation du graph de cooccurence

In [8]:
def create_cooccurrence_graph(data):
    genre_graphs = defaultdict(nx.Graph)

    for _, row in data.iterrows():
        for genre in row['genres']:
            actor_list = movie_to_actors[row['tconst']]
            for i in range(len(actor_list)):
                for j in range(i + 1, len(actor_list)):
                    actor_i, actor_j = actor_list[i], actor_list[j]
                    if actor_i != actor_j:
                        if not genre_graphs[genre].has_edge(actor_i, actor_j):
                            genre_graphs[genre].add_edge(actor_i, actor_j, weight=0)
                        genre_graphs[genre][actor_i][actor_j]['weight'] += 1

    return genre_graphs

genre_graphs = create_cooccurrence_graph(movies)

### Stats du graph

In [9]:
for genre, graph in genre_graphs.items():
    print(f"Genre: {genre}")
    print(f"Nombre d'acteurs: {graph.number_of_nodes()}")
    print(f"Nombre d'arêtes: {graph.number_of_edges()}")
    print(f"Degré moyen: {np.mean(list(dict(graph.degree()).values()))}")
    print(f"Clustering moyen: {nx.average_clustering(graph)}")
    print()

Genre: Crime
Nombre d'acteurs: 5685
Nombre d'arêtes: 55937
Degré moyen: 19.678803869832894
Clustering moyen: 0.41824377188894124

Genre: Drama
Nombre d'acteurs: 6201
Nombre d'arêtes: 142020
Degré moyen: 45.8055152394775
Clustering moyen: 0.23955844701219262

Genre: Mystery
Nombre d'acteurs: 4397
Nombre d'arêtes: 23268
Degré moyen: 10.583579713440983
Clustering moyen: 0.5673470883319413

Genre: History
Nombre d'acteurs: 2507
Nombre d'arêtes: 10093
Degré moyen: 8.051854806541684
Clustering moyen: 0.7217298456057789

Genre: War
Nombre d'acteurs: 2016
Nombre d'arêtes: 7342
Degré moyen: 7.283730158730159
Clustering moyen: 0.7198019555609797

Genre: Romance
Nombre d'acteurs: 5501
Nombre d'arêtes: 51255
Degré moyen: 18.63479367387748
Clustering moyen: 0.40919071258772144

Genre: Western
Nombre d'acteurs: 1281
Nombre d'arêtes: 7444
Degré moyen: 11.62217017954723
Clustering moyen: 0.7035040046518177

Genre: Comedy
Nombre d'acteurs: 6030
Nombre d'arêtes: 92457
Degré moyen: 30.665671641791043
Clu

### Plot le graph pour un genre

In [10]:
def plot_graph(graph, title):
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(graph, k=0.15)
    weights = nx.get_edge_attributes(graph, 'weight').values()
    nx.draw(graph, pos, node_size=50, font_size=10, edge_color=weights, edge_cmap=plt.cm.viridis, width=2)
    plt.title(title)
    plt.show()


### Save graph


In [11]:
import os

def save_graph(graph, title, filepath):
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(graph, k=0.15)
    weights = nx.get_edge_attributes(graph, 'weight').values()
    nx.draw(graph, pos, node_size=50, font_size=10, edge_color=weights, edge_cmap=plt.cm.viridis, width=2)
    plt.title(title)
    plt.savefig(filepath)
    plt.close()

if not os.path.exists("graphs"):
    os.makedirs("graphs")

In [12]:
data = []
labels = []

for genre in selected_genres:
    genre_movies = movies[movies['genres'].apply(lambda x: genre in x)].sample(min_num_movies)
    genre_movies = genre_movies.reset_index(drop=True)
    num_movies = len(genre_movies)
    for i in range(0, num_movies, 300):
        lot_movies = genre_movies.iloc[i:i+300]
        if not lot_movies.empty:
            genre_graphs = create_cooccurrence_graph(lot_movies)
            graph = genre_graphs[genre]
            features = extract_features(graph)
            data.append(features)
            labels.append(genre)
            genre_dir = os.path.join("graphs", genre)
            if not os.path.exists(genre_dir):
                os.makedirs(genre_dir)
            lot_number = i // 300 + 1
            filename = f'cooccurrence_graph_lot_{lot_number}.png'
            filepath = os.path.join(genre_dir, filename)
            save_graph(graph, f'Co-occurrence Graph for {genre} (Lot {lot_number})', filepath)

X = pd.DataFrame(data)
y = labels


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [14]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


              precision    recall  f1-score   support

      Action       1.00      1.00      1.00         6
      Comedy       0.33      0.11      0.17         9
       Drama       0.45      0.62      0.53         8
     Romance       0.30      0.43      0.35         7

    accuracy                           0.50        30
   macro avg       0.52      0.54      0.51        30
weighted avg       0.49      0.50      0.47        30

Accuracy: 0.5
