## indlæs, lav graf

In [None]:
import pandas as pd
import itertools
import networkx as nx
import matplotlib.pyplot as plt

# 1. Load the raw tables.
df = pd.read_csv("data/overview.csv")
movies_df = pd.read_csv("data/movies.csv")


def _split_pipe(value):
    """Split a "|"-separated string into stripped, non-empty parts."""
    pieces = (piece.strip() for piece in value.split("|"))
    return [piece for piece in pieces if piece != ""]


# 2. Clean the cast column: missing values become empty strings.
df["cast_names"] = df["cast_names"].fillna("").astype(str)

# 3. One list of actor names per movie.
df["cast_list"] = df["cast_names"].apply(_split_pipe)

# 4. Attach each movie's genres (left join, so every overview row is kept).
merged_df = df.merge(movies_df[["movieId", "genres"]], on="movieId", how="left")

# 5. One list of genre labels per movie.
merged_df["genres"] = merged_df["genres"].fillna("")
merged_df["genre_list"] = merged_df["genres"].apply(_split_pipe)

# 6. Keep only the movies tagged with one specific genre (exact label match).
target_genre = "Action"   # <- change here

has_target_genre = merged_df["genre_list"].apply(lambda labels: target_genre in labels)
genre_df = merged_df[has_target_genre].copy()

# Optional export (disabled): restrict data/links.csv to the movies in genre_df.
#links_df = pd.read_csv("data/links.csv")
#links_df = links_df[ links_df["movieId"].isin( genre_df["movieId"] ) ]
#print(links_df.head())
#links_df.to_csv("data/links_action.csv", index=False)

# 7. Check the result: row counts before and after the genre filter.
#print(genre_df.head())
print("Antal film i alt:", len(merged_df))
print("Antal film i genren:", len(genre_df))
# Show the genre column.
#print(merged_df[ "genre_list"].head())

In [None]:
# Flatten the per-movie genre lists and count how often each label occurs.
all_genres = [genre for genre_labels in merged_df["genre_list"] for genre in genre_labels]
genre_counts = pd.Series(all_genres).value_counts()
print("Genre counts:", genre_counts, sep="\n")


In [None]:
import itertools
import networkx as nx

# 4. Build the co-appearance graph: nodes are actors, and the "weight" of an
#    edge counts the movies in genre_df whose cast lists contain both actors.
G = nx.Graph()

for cast in genre_df["cast_list"]:
    # Drop names repeated within one cast list (keeping first-seen order).
    # Otherwise combinations() would pair a name with itself and create a
    # self-loop, which inflates that actor's degree and hides isolated nodes.
    unique_cast = list(dict.fromkeys(cast))

    # Register every actor as a node, so single-actor casts are present too.
    G.add_nodes_from(unique_cast)

    # Connect every pair of actors that appear in the same movie.
    for a, b in itertools.combinations(unique_cast, 2):
        if G.has_edge(a, b):
            # The pair has already met in another movie: bump the weight.
            G[a][b]["weight"] += 1
        else:
            G.add_edge(a, b, weight=1)

# 5. Work on a copy without isolated nodes (actors with no co-actor at all).
H = G.copy()
isolated = [n for n, d in H.degree() if d == 0]
H.remove_nodes_from(isolated)

# Info
print("Original graf: {} noder, {} kanter".format(G.number_of_nodes(), G.number_of_edges()))
print("Renset graf uden isolerede noder: {} noder, {} kanter".format(H.number_of_nodes(), H.number_of_edges()))


## Graf undersøgelse

- Degree Distribution
- Hvem har lavest og højest degree
- Max/min degree

In [None]:
# Degree distribution of H, with one histogram bin per integer degree.
degrees = [d for n, d in H.degree()]
# The bin edges must run to max + 1 so the highest degree gets its own bin.
# With edges stopping at max, the closed last bin [max - 1, max] would lump
# the two highest degrees together.
plt.hist(degrees, bins=range(1, max(degrees) + 2), edgecolor='black')
plt.title("Degree Distribution")
plt.xlabel("Degree")
plt.ylabel("Number of Nodes")
plt.yscale('log')  # log scale, so rarely occurring degrees remain visible
plt.show()

In [None]:
# Rank the actors in H by degree, highest first, then show both ends.
degree_sequence = list(H.degree())
degree_sequence.sort(key=lambda pair: pair[1], reverse=True)

for heading, rows in (
    ("\nTop 5 skuespillere efter degree:", degree_sequence[:5]),
    ("\nBottom 5 skuespillere efter degree:", degree_sequence[-5:]),
):
    print(heading)
    for actor, degree in rows:
        print(f"{actor}: {degree}")

In [None]:
# degree_sequence is sorted from highest to lowest degree, so the extremes
# are its first and last entries.
top_entry = degree_sequence[0]
bottom_entry = degree_sequence[-1]
max_degree_actor, max_degree = top_entry
min_degree_actor, min_degree = bottom_entry
print("\nSkuespiller med højst degree: {} ({})".format(*top_entry))
print("Skuespiller med lavest degree: {} ({})".format(*bottom_entry))

## Community

### Community TF-IDF 

- Modularity
- Structural Communities (Louvain algorithm) to find optimal structural communities
- Purity metrics - Community purity


In [None]:
from collections import Counter

from community import community_louvain

# Louvain is randomised. A fixed seed makes the partition, and therefore the
# community ids that the following cells print and look up, identical on
# every run.
LOUVAIN_SEED = 42

# Giant connected component (GCC): the connected component with most nodes.
components = nx.connected_components(H)
gcc_nodes = max(components, key=len)

# Independent copy of the GCC as its own graph.
H_gcc = H.subgraph(gcc_nodes).copy()

print(f"Størrelse af GCC: {H_gcc.number_of_nodes()} nodes, {H_gcc.number_of_edges()} edges")

# Run Louvain on the GCC, using the co-appearance counts as edge weights.
partition_gcc = community_louvain.best_partition(
    H_gcc, weight="weight", random_state=LOUVAIN_SEED
)

# Number of communities
num_comms = len(set(partition_gcc.values()))
print(f"Louvain på GCC fandt {num_comms} communities")

# Modularity of the partition, with the same edge weights as the optimisation.
Q = community_louvain.modularity(partition_gcc, H_gcc, weight="weight")
print(f"Modularity på GCC: {Q:.4f}")

# Community id -> number of actors; print the 20 largest, biggest first.
community_sizes_gcc = Counter(partition_gcc.values())

for comm, size in community_sizes_gcc.most_common(20):
    print(f"Community {comm}: {size} skuespillere")


In [None]:
import networkx as nx
from collections import defaultdict

from collections import Counter

community_names = {}   # community id -> display name (name of its hub actor)
community_hubs = {}    # community id -> [(hub actor, weighted degree)]

# Group the members of each community in a single pass over the partition,
# instead of rescanning the whole partition once per community.
community_members = defaultdict(list)
for node, comm in partition_gcc.items():
    community_members[comm].append(node)

# Find the top-1 hub of every community and name the community after it.
for comm, nodes in community_members.items():
    sub = H_gcc.subgraph(nodes)

    # Weighted degree counted inside the community only (sub has no other edges).
    weighted_degrees = sub.degree(weight="weight")

    # Node with the highest weighted degree; the first one wins on ties.
    top1 = [max(weighted_degrees, key=lambda pair: pair[1])]
    community_hubs[comm] = top1

    # Community name built from the hub name(s).
    community_names[comm] = " - ".join(actor for actor, _ in top1)

# Print the 20 largest communities with their automatic names.
sizes = Counter(partition_gcc.values())

print("Top 20 communities med navne:")
for comm, size in sizes.most_common(20):
    print(f"Community {comm} (size {size}) → {community_names[comm]}")


In [None]:
from collections import defaultdict

# movieId -> overview text, for the movies in genre_df.
movie_overview = dict(zip(genre_df["movieId"], genre_df["overview"]))

# Actor name -> ids of the genre_df movies whose cast list contains the actor.
actor_movies = defaultdict(list)

for movie_id, cast in zip(genre_df["movieId"], genre_df["cast_list"]):
    for actor in cast:
        actor_movies[actor].append(movie_id)

# Spot check. .get() is used on purpose: indexing a defaultdict with a name
# that is absent would insert an empty entry for it, and that phantom actor
# would then be carried into every later per-actor computation.
print(actor_movies.get("Bruce Willis", []))

In [None]:
from collections import defaultdict

# Community id -> overview texts of the movies its members appear in.
# An overview is appended once per member actor, so a movie with several
# cast members in the same community contributes its text several times.
community_corpus = defaultdict(list)

for actor, comm in partition_gcc.items():
    for movie_id in actor_movies.get(actor, []):
        text = movie_overview.get(movie_id, "")
        # Skip missing (non-string) and blank overviews.
        if not isinstance(text, str) or not text.strip():
            continue
        community_corpus[comm].append(text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)

# Community id -> its 15 highest-scoring terms, best first.
community_keywords = {}

for comm, docs in community_corpus.items():
    # Communities with fewer than 10 overview texts are left out.
    if len(docs) >= 10:
        # The vectorizer is re-fitted on this community's texts alone, so the
        # IDF part is relative to the community's own documents.
        matrix = vectorizer.fit_transform(docs)
        vocabulary = vectorizer.get_feature_names_out()

        # Mean TF-IDF score of every term across the community's texts.
        mean_scores = matrix.mean(axis=0).A1
        best_first = mean_scores.argsort()[::-1][:15]

        community_keywords[comm] = [vocabulary[i] for i in best_first]


In [None]:
from collections import Counter

# Louvain community ids are arbitrary labels that can differ from run to run,
# so take the ten largest communities from the current partition instead of
# hard-coding ids (an id missing from this run would raise a KeyError).
top_communities = [
    comm for comm, _ in Counter(partition_gcc.values()).most_common(10)
]

for comm in top_communities:
    print(f"\nCommunity {comm}: {community_names.get(comm, 'Unknown')}")
    print("Keywords:")
    # Communities that had too few texts for keyword extraction have no entry.
    print(", ".join(community_keywords.get(comm, ["No data"])))


### Sentiment-analysis

In [None]:
import pandas as pd
import re
import numpy as np

# Load the labMT word list (tab-separated).
# NOTE(review): assumes the first line of the file is the column header with
# "word" and "happiness_average" columns - confirm against the file on disk.
labmt = pd.read_csv("Data_Set_S1.txt", sep="\t")

# Lookup table: lower-cased word -> average happiness score.
_labmt_words = labmt["word"].astype(str).str.lower()
labmt_dict = dict(zip(_labmt_words, labmt["happiness_average"]))

def tokenize(text):
    """Split *text* into lower-case word tokens.

    A token is a maximal run of the letters a-z and apostrophes, matched after
    lower-casing. Non-string input is converted with ``str()`` first, except
    for missing values (``None`` or a float NaN): those yield no tokens, so
    they are not read as the words "none" / "nan".

    Returns:
        list[str]: the tokens in order of appearance (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return re.findall(r"[a-z']+", str(text).lower())



In [None]:
def labmt_sentiment(text, word_dict, lens=1.0, center=5.0):
    """Average the scores of the non-neutral words found in *text*.

    Args:
        text: text to score; it is split into tokens with ``tokenize``.
        word_dict: mapping from token to numeric score.
        lens: half-width of the neutral band around ``center``.
        center: score regarded as neutral.

    Returns:
        The mean, as a ``float``, of the scores lying strictly outside
        ``[center - lens, center + lens]``, or ``None`` when no token of
        *text* has such a score.
    """
    lower_bound = center - lens
    upper_bound = center + lens

    kept = []
    for token in tokenize(text):
        if token not in word_dict:
            continue
        score = word_dict[token]
        # Scores inside the neutral band (e.g. "movie", "film", "time") are ignored.
        if score < lower_bound or score > upper_bound:
            kept.append(score)

    return float(np.mean(kept)) if kept else None


In [None]:
reviews_df = pd.read_csv("data/reviews.csv")


def _score_review(text):
    """Score one review text against the labMT lookup table."""
    return labmt_sentiment(text, labmt_dict)


# One sentiment value per review row.
reviews_df["sentiment"] = reviews_df["reviews"].apply(_score_review)

In [None]:
# Mean review sentiment per movie.
per_movie_mean = reviews_df.groupby("movieId")["sentiment"].mean()

# Movies whose mean is NaN are left out of the mapping.
movie_sentiment = per_movie_mean[per_movie_mean.notna()].to_dict()

print(movie_sentiment)


In [None]:
import numpy as np

# Actor name -> mean sentiment over the actor's movies that have a score,
# or None when none of the actor's movies has one.
actor_sentiment = {}

for actor, movies in actor_movies.items():
    scores = [
        movie_sentiment[movie_id]
        for movie_id in movies
        # explicit NaN check on top of the membership test
        if movie_id in movie_sentiment and not np.isnan(movie_sentiment[movie_id])
    ]
    actor_sentiment[actor] = np.mean(scores) if scores else None


In [None]:
# Dump the complete actor -> mean sentiment mapping for inspection.
print(actor_sentiment)

In [None]:
from collections import defaultdict
import numpy as np

# Community id -> sentiment values of its members that have one.
community_sentiment = defaultdict(list)

for actor, comm in partition_gcc.items():
    score = actor_sentiment[actor]
    if score is None:
        # No sentiment is available for this actor.
        continue
    community_sentiment[comm].append(score)

# Community id -> mean of its members' sentiment values.
community_sentiment_mean = {}
for comm, vals in community_sentiment.items():
    community_sentiment_mean[comm] = np.mean(vals)


In [None]:
# (community id, mean sentiment) pairs, highest mean first.
sorted_sent = list(community_sentiment_mean.items())
sorted_sent.sort(key=lambda pair: pair[1], reverse=True)

sorted_sent[:10]   # the 10 communities with the highest mean sentiment


In [None]:
# The 10 highest-ranked communities together with their display names.
linked = [
    {
        "community": comm,
        "name": community_names.get(comm, "Unknown"),
        "sentiment": sent,
    }
    for comm, sent in sorted_sent[:10]
]

linked

## Network Modellering

- Small World
- Scale Free
- Power Law Exponent
- Bipartite Network (anti block diagonals and perform projections)

In [None]:
import networkx as nx

# Largest connected component (LCC) of H, copied into an independent graph.
largest_component = max(nx.connected_components(H), key=len)
Hcc = H.subgraph(largest_component).copy()

print("LCC nodes:", Hcc.number_of_nodes())
print("LCC edges:", Hcc.number_of_edges())


In [None]:
# Small-world indicators of the LCC: clustering and typical path length.
# The two measures do not depend on each other.
C_H = nx.average_clustering(Hcc)
L_H = nx.average_shortest_path_length(Hcc)

print("Average shortest path length (H):", L_H)
print("Average clustering (H):", C_H)


In [None]:
# Random G(n, m) reference graph with as many nodes and edges as the LCC.
N = Hcc.number_of_nodes()
M = Hcc.number_of_edges()

# Fixed seed, so the baseline numbers are the same on every run.
H_er = nx.gnm_random_graph(N, M, seed=42)

# A random graph is not guaranteed to be connected, and
# average_shortest_path_length raises on a disconnected graph, so the path
# length is measured on the largest connected component of the random graph.
er_largest_component = max(nx.connected_components(H_er), key=len)
L_er = nx.average_shortest_path_length(H_er.subgraph(er_largest_component))
C_er = nx.average_clustering(H_er)

print("ER shortest path:", L_er)
print("ER clustering:", C_er)


## Centrality

- Top 5 nodes for Degree Centrality
- Top 5 nodes for Betweenness Centrality
- Top 5 nodes for Eigenvector Centrality

In [None]:
# Degree centrality of every actor in G, ranked from highest to lowest.
degree_centrality = nx.degree_centrality(G)
sorted_degree = list(degree_centrality.items())
sorted_degree.sort(key=lambda item: item[1], reverse=True)

print("\nTop 5 skuespillere efter degree centrality:")
top_by_degree = sorted_degree[:5]
for actor, centrality in top_by_degree:
    print(f"{actor}: {centrality:.4f}")

In [None]:
# Betweenness centrality of every actor in G, ranked from highest to lowest.
betweenness_centrality = nx.betweenness_centrality(G)
sorted_betweenness = list(betweenness_centrality.items())
sorted_betweenness.sort(key=lambda item: item[1], reverse=True)

print("\nTop 5 skuespillere efter betweenness centrality:")
top_by_betweenness = sorted_betweenness[:5]
for actor, centrality in top_by_betweenness:
    print(f"{actor}: {centrality:.4f}")

In [None]:
# Eigenvector centrality of every actor in G (at most 1000 power iterations),
# ranked from highest to lowest.
eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)
sorted_eigenvector = list(eigenvector_centrality.items())
sorted_eigenvector.sort(key=lambda item: item[1], reverse=True)

print("\nTop 5 skuespillere efter eigenvector centrality:")
top_by_eigenvector = sorted_eigenvector[:5]
for actor, centrality in top_by_eigenvector:
    print(f"{actor}: {centrality:.4f}")

## Correlation

- Spearman
- Pearson


In [None]:
# Spearman rank correlation between the centrality measures.
from scipy.stats import spearmanr

# Read all three measures in one shared node order, so that position i refers
# to the same actor in every list. (The ranked lists sorted_degree,
# sorted_betweenness and sorted_eigenvector are each ordered by their own
# score; taking values from them would pair up different actors.)
centrality_nodes = list(degree_centrality)
degree_values = [degree_centrality[node] for node in centrality_nodes]
betweenness_values = [betweenness_centrality[node] for node in centrality_nodes]
eigenvector_values = [eigenvector_centrality[node] for node in centrality_nodes]

for label, xs, ys in (
    ("degree vs. betweenness", degree_values, betweenness_values),
    ("degree vs. eigenvector", degree_values, eigenvector_values),
    ("betweenness vs. eigenvector", betweenness_values, eigenvector_values),
):
    rho, p_value = spearmanr(xs, ys)
    print(f"Spearman ({label}): rho = {rho:.4f}, p = {p_value:.3g}")

## Assortativity

- Degree Assortativity
- Attribute Assortativity

In [None]:
# Degree assortativity coefficient, computed on the full graph G
# (not on H, the copy without isolated nodes).
assortativity = nx.degree_assortativity_coefficient(G)
print("\nDegree assortativity koefficient:", assortativity)

## Visualization

- Layouts (spring_layout vs kamada_kawai_layout)
- ForceAtlas2 for aesthetic visualizations
- Node size depending on degree
- Heatmaps, plot of in and out degrees
- Backbone - Disparity Filter & High-Salience Skeleton (HSS)

In [None]:
import networkx as nx
import math

def disparity_filter(G, alpha=0.05, weight="weight"):
    """Extract the backbone of a weighted graph with the disparity filter.

    For a node with ``k`` neighbours, the edge to neighbour ``j`` carries the
    fraction ``p_ij`` of the node's total incident weight. Under the null
    model, where that total is split uniformly at random over the ``k``
    edges, the probability of a fraction at least this large is
    ``alpha_ij = (1 - p_ij) ** (k - 1)``. An edge is kept when it is
    significant (``alpha_ij < alpha``) for at least one of its endpoints.

    Args:
        G: undirected NetworkX graph.
        alpha: significance threshold; smaller values keep fewer edges.
        weight: name of the edge attribute holding the weight. An edge
            without this attribute counts as weight 1.

    Returns:
        A new ``nx.Graph`` with all nodes of ``G`` (including their
        attributes) and only the retained edges, with their attributes.
    """
    backbone = nx.Graph()
    backbone.add_nodes_from(G.nodes(data=True))

    for node in G.nodes():
        incident = G[node]
        k = len(incident)  # number of neighbours
        if k <= 1:
            # The test is undefined for a single edge; keep it, so that
            # nodes with one neighbour stay attached.
            for nbr, data in incident.items():
                backbone.add_edge(node, nbr, **data)
            continue

        # Total weight incident to this node.
        w_sum = sum(data.get(weight, 1) for data in incident.values())
        if w_sum <= 0:
            # No weight fractions can be formed for this node.
            continue

        for nbr, data in incident.items():
            p_ij = data.get(weight, 1) / w_sum

            # p-value of the edge: small when the edge carries a surprisingly
            # large share of the node's weight. (Using 1 - (1 - p_ij)**(k - 1)
            # here would invert the test and retain the weakest edges.)
            alpha_ij = (1 - p_ij) ** (k - 1)

            if alpha_ij < alpha:
                backbone.add_edge(node, nbr, **data)

    return backbone

# Apply the disparity filter to the graph H and report the backbone size.
backbone_H = disparity_filter(H, alpha=0.5)
backbone_node_count = backbone_H.number_of_nodes()
backbone_edge_count = backbone_H.number_of_edges()
print("\nBackbone graf efter Disparity Filter: {} noder, {} kanter".format(backbone_node_count, backbone_edge_count))

# Plot the backbone with a seeded spring layout: nodes first, then edges.
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(backbone_H, seed=42)
node_style = {"node_size": 20, "node_color": 'blue'}
edge_style = {"alpha": 0.5}
nx.draw_networkx_nodes(backbone_H, pos, **node_style)
nx.draw_networkx_edges(backbone_H, pos, **edge_style)
plt.show()


In [None]:
import networkx as nx
from collections import defaultdict

def high_salience_skeleton(G, threshold=0.1, weight="weight"):
    """Extract the backbone of a graph with the High Salience Skeleton method.

    For every node a shortest-path tree rooted at that node is computed. The
    salience of an edge is the fraction of these trees that contain it, so it
    always lies between 0 and 1. Edges whose salience is at least
    ``threshold`` form the backbone.

    Args:
        G: undirected NetworkX graph.
        threshold: minimum salience (0-1) an edge needs to be kept.
        weight: edge attribute passed to Dijkstra as the edge *distance*.
            NOTE(review): the attribute is used as a distance as-is. If it
            holds a tie strength (larger = closer), callers presumably want
            a reciprocal-weight attribute here - confirm.

    Returns:
        A new ``nx.Graph`` with all nodes of ``G`` (including their
        attributes) and the retained edges. Each retained edge keeps its
        original attributes plus a ``salience`` attribute with its score.
    """
    backbone = nx.Graph()
    backbone.add_nodes_from(G.nodes(data=True))

    n_nodes = G.number_of_nodes()
    if n_nodes == 0:
        return backbone

    # Unordered edge -> number of shortest-path trees that contain it.
    tree_count = defaultdict(int)

    # One Dijkstra run per root node; this is the expensive part on large graphs.
    for source in G.nodes():
        paths = nx.single_source_dijkstra_path(G, source, weight=weight)

        # Each reached target hangs off the tree by exactly one edge: the
        # last hop of its path. Counting last hops therefore counts every
        # tree edge once per tree, rather than once per path through it.
        for path in paths.values():
            if len(path) < 2:
                continue  # the root itself
            tree_count[frozenset(path[-2:])] += 1

    for edge, count in tree_count.items():
        score = count / n_nodes  # fraction of the trees containing the edge
        if score >= threshold:
            u, v = edge
            # Copy the original attributes. The dict is built first so that a
            # "salience" key already on the edge is overwritten, instead of
            # causing a duplicate-keyword error.
            attrs = dict(G[u][v])
            attrs["salience"] = score
            backbone.add_edge(u, v, **attrs)

    return backbone


In [None]:
# High Salience Skeleton backbone of H, drawn with a seeded spring layout.
backbone_HSS = high_salience_skeleton(H, threshold=0.0005)

plt.figure(figsize=(12, 12))
pos = nx.spring_layout(backbone_HSS, seed=42)

# Nodes first, then edges.
node_style = {"node_size": 20, "node_color": 'red'}
edge_style = {"alpha": 0.5}
nx.draw_networkx_nodes(backbone_HSS, pos, **node_style)
nx.draw_networkx_edges(backbone_HSS, pos, **edge_style)

plt.title("Backbone Graph efter High Salience Skeleton")
plt.show()
