In [2]:
import pandas as pd
import networkx as nx
import plotly.graph_objects as go
from pathlib import Path
from sklearn.preprocessing import minmax_scale
from collections import Counter
import itertools
import numpy as np

# Pandas display settings: render every column and up to 100 rows of a DataFrame.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

In [3]:
# Locations searched for the dataset, in priority order: the project data
# folder first, then the notebook's own directory as a fallback.
CSV_CANDIDATES = (
    Path('../data/youtube_videos_final.csv'),
    Path('./youtube_videos_final.csv'),
)


def load_videos(candidates=CSV_CANDIDATES):
    """Load the first readable CSV among ``candidates`` and normalise ``tags``.

    Args:
        candidates: Iterable of paths (str or Path), tried in order.

    Returns:
        The loaded DataFrame. When a ``tags`` column is present, missing values
        are replaced by '' before the column is cast to ``str``, so that NaN
        does not turn into the literal tag 'nan'.
        An empty DataFrame is returned when no candidate exists or when
        reading fails, so callers can branch on ``df.empty``.
    """
    candidates = [Path(candidate) for candidate in candidates]

    for index, path in enumerate(candidates):
        try:
            frame = pd.read_csv(path)
        except FileNotFoundError:
            print(f"Warning: File not found at '{path}'.")
            continue
        except Exception as e:
            # Deliberately broad: a malformed file is reported rather than
            # raised, matching the notebook's "keep running" behaviour.
            print(f"An error occurred while reading '{path}': {e}")
            return pd.DataFrame()

        if index == 0:
            print("CSV file loaded successfully!")
        else:
            print(f"CSV file loaded successfully from fallback path '{path}'!")

        # Basic cleaning needed by the charts below.
        if 'tags' in frame.columns:
            frame['tags'] = frame['tags'].fillna('').astype(str)
        return frame

    # Report the paths that were actually tried, resolved to absolute form.
    tried = ', '.join(str(path.resolve()) for path in candidates)
    print(f"Error: File not found at any of: {tried}")
    print("Por favor, asegúrate de que el archivo 'youtube_videos_final.csv' esté en la ruta correcta.")
    return pd.DataFrame()


df = load_videos()

if not df.empty:
    print("\nData Head:")
    print(df.head())

CSV file loaded successfully!

Data Head:
      video_id country trending_date  \
0  n1WpP7iowLc      CA      17.14.11   
1  0dBIkQ4Mz1M      CA      17.14.11   
2  5qpjK5DgCt4      CA      17.14.11   
3  d380meD0W0M      CA      17.14.11   
4  2Vv-BfVoq4g      CA      17.14.11   

                                               title channel_title  \
0         Eminem - Walk On Water (Audio) ft. Beyoncé    EminemVEVO   
1                      PLUSH - Bad Unboxing Fan Mail     iDubbbzTV   
2  Racist Superman | Rudy Mancuso, King Bach & Le...  Rudy Mancuso   
3                           I Dare You: GOING BALD!?      nigahiga   
4        Ed Sheeran - Perfect (Official Music Video)    Ed Sheeran   

   category_id  category_name              publish_time  \
0           10          Music  2017-11-10T17:00:03.000Z   
1           23         Comedy  2017-11-13T17:00:00.000Z   
2           23         Comedy  2017-11-12T19:05:24.000Z   
3           24  Entertainment  2017-11-12T18:01:41.000Z   


In [None]:
# --- 1. Data preparation ---
def parse_tags(raw_tags):
    """Split a '|'-separated tag string into a sorted list of unique tags.

    Each tag is stripped and lower-cased; blank entries and the '[none]'
    placeholder (compared case-insensitively) are dropped. The result is
    sorted so that every pair of tags is later generated in a single,
    canonical orientation.
    """
    cleaned = {tag.strip().lower() for tag in raw_tags.split('|')}
    cleaned.discard('')
    cleaned.discard('[none]')
    return sorted(cleaned)


df_music = df[df['category_name'] == 'Music'].copy()
df_music['tags_list'] = df_music['tags'].apply(parse_tags)

# Frequency of every tag across Music videos; keep the 100 most common.
all_tags = list(itertools.chain.from_iterable(df_music['tags_list']))
top_tags_count = Counter(all_tags).most_common(100)
top_tags = [tag for tag, _ in top_tags_count]
top_tag_set = set(top_tags)  # constant-time membership test inside the loop below
node_sizes = dict(top_tags_count)

# Count how often two top tags appear on the same video. Because each
# tags_list is sorted, combinations() always yields (a, b) with a < b, so an
# undirected pair is never split across two Counter keys.
edge_list = []
for tags in df_music['tags_list']:
    filtered_tags = [tag for tag in tags if tag in top_tag_set]
    edge_list.extend(itertools.combinations(filtered_tags, 2))
edge_counts = Counter(edge_list).most_common(150)

# --- 2. Graph construction (NetworkX) ---
# Only tags that take part in one of the 150 strongest pairs become nodes.
G_tags = nx.Graph()
G_tags.add_weighted_edges_from((u, v, weight) for (u, v), weight in edge_counts)
for node in G_tags.nodes():
    G_tags.nodes[node]['size'] = node_sizes.get(node, 1)

# --- 3. Visualisation (Plotly) ---
# Fixed seed so the layout is reproducible between runs.
pos = nx.spring_layout(G_tags, k=0.3, iterations=50, seed=42)

# All edges are drawn with a single trace; the None entries break the line
# between consecutive segments so unrelated edges are not joined.
edge_x = []
edge_y = []
for u, v in G_tags.edges():
    x0, y0 = pos[u]
    x1, y1 = pos[v]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.7, color='#888'),
    hoverinfo='none',
    mode='lines')

# Node coordinates, hover text and raw tag frequencies.
node_x = []
node_y = []
node_text = []
node_sizes_list = []
for node in G_tags.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    size = G_tags.nodes[node]['size']
    node_text.append(f"Tag: {node}<br>Frecuencia: {size}")
    node_sizes_list.append(size)

# Marker diameters are rescaled to 10-40 px; with a single distinct frequency
# there is nothing to scale, so a constant size is used instead.
if len(set(node_sizes_list)) > 1:
    scaled_node_size = minmax_scale(node_sizes_list, (10, 40))
else:
    scaled_node_size = [20] * len(node_sizes_list)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    text=node_text,
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        reversescale=True,
        # Colour by the raw frequency so the colour bar matches its title;
        # the scaled values only drive the marker diameter.
        color=node_sizes_list,
        size=scaled_node_size,
        colorbar=dict(
            thickness=15,
            title=dict(text='Frecuencia de Tag', side='right'),
            xanchor='left'
        ),
        line_width=2,
        line_color='#333'
    )
)

# --- 4. Figure rendering ---
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title=dict(
                    text='<br>Red de Co-ocurrencia de Tags en "Música" (Simplificado)',
                    font=dict(size=16)
                ),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20, l=5, r=5, t=40),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                plot_bgcolor='#f4f4f4'
                )
            )
fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# --- 1. Data preparation ---
# Total views for every (channel, category) pair; these become the edge weights.
df_channel_cat = df.groupby(['channel_title', 'category_name'])['views'].sum().reset_index()

# Total views per channel and per category. Computed once and reused both to
# pick the top-N nodes and to size the markers.
channel_totals = df.groupby('channel_title')['views'].sum()
category_totals = df.groupby('category_name')['views'].sum()
channel_views = channel_totals.to_dict()
category_views = category_totals.to_dict()

# Keep the chart readable: 50 most-viewed channels, 20 most-viewed categories.
top_channels = channel_totals.nlargest(50).index
top_categories = category_totals.nlargest(20).index

df_filtered = df_channel_cat[
    df_channel_cat['channel_title'].isin(top_channels) &
    df_channel_cat['category_name'].isin(top_categories)
]

# --- 2. Bipartite graph construction (NetworkX) ---
B = nx.Graph()

# Nodes carry their kind ('channel' / 'category') and their total views.
# NOTE(review): channels and categories share one node namespace; a channel
# titled exactly like a category would overwrite that node - confirm this
# cannot happen in the data.
for channel in df_filtered['channel_title'].unique():
    B.add_node(channel, type='channel', views=channel_views.get(channel, 0))

for category in df_filtered['category_name'].unique():
    B.add_node(category, type='category', views=category_views.get(category, 0))

# Each edge stores the views of that channel/category combination under the
# 'views' attribute; every later read must use that same key.
edges = list(zip(
    df_filtered['channel_title'],
    df_filtered['category_name'],
    df_filtered['views'],
))
B.add_weighted_edges_from(edges, weight='views')

# --- 3. Bipartite visualisation (Plotly) ---

# Channels are stacked on the left column (x=1), categories on the right
# (x=2) and stretched vertically so both columns have a comparable height.
pos = {}
channel_nodes = [n for n, d in B.nodes(data=True) if d['type'] == 'channel']
category_nodes = [n for n, d in B.nodes(data=True) if d['type'] == 'category']

pos.update((node, (1, i)) for i, node in enumerate(channel_nodes))
pos.update((node, (2, i * (len(channel_nodes) / len(category_nodes)))) for i, node in enumerate(category_nodes))

# Edge weights, read back with the key they were stored under.
weighted_edges = list(B.edges(data='views'))
edge_weights = [weight for _, _, weight in weighted_edges]

# Heavier edges are drawn more opaque (alpha between 0.1 and 0.7).
scaled_edge_alpha = minmax_scale(edge_weights, (0.1, 0.7))

# One trace per edge, because a single Scatter line cannot vary its opacity
# per segment. These traces are hidden from the legend so it only lists the
# two node types.
edge_traces = []
for (u, v, weight), alpha in zip(weighted_edges, scaled_edge_alpha):
    x0, y0 = pos[u]
    x1, y1 = pos[v]
    edge_traces.append(
        go.Scatter(
            x=[x0, x1, None],
            y=[y0, y1, None],
            line=dict(width=1.5, color=f'rgba(50, 50, 50, {alpha:.3f})'),
            hoverinfo='text',
            text=f"Vistas: {weight:,.0f}",
            mode='lines',
            showlegend=False
        )
    )

# Node coordinates, hover text and sizes, collected separately per node type.
node_x_ch, node_y_ch, node_text_ch, node_size_ch = [], [], [], []
node_x_cat, node_y_cat, node_text_cat, node_size_cat = [], [], [], []

for node, data in B.nodes(data=True):
    x, y = pos[node]
    views = data['views']
    text = f"<b>{node}</b><br>Vistas Totales: {views:,.0f}"

    if data['type'] == 'channel':
        node_x_ch.append(x)
        node_y_ch.append(y)
        node_text_ch.append(text)
        node_size_ch.append(views)
    else:
        node_x_cat.append(x)
        node_y_cat.append(y)
        node_text_cat.append(text)
        node_size_cat.append(views)

# Marker diameters are rescaled to 10-50 px, independently for each node type.
scaled_size_ch = minmax_scale(node_size_ch, (10, 50))
scaled_size_cat = minmax_scale(node_size_cat, (10, 50))

# Node traces
trace_channels = go.Scatter(
    x=node_x_ch, y=node_y_ch,
    mode='markers',
    hoverinfo='text',
    text=node_text_ch,
    marker=dict(
        size=scaled_size_ch,
        color='#0077b6',  # blue
        line=dict(width=2, color='#333')
    ),
    name='Canal'
)

trace_categories = go.Scatter(
    x=node_x_cat, y=node_y_cat,
    mode='markers',
    hoverinfo='text',
    text=node_text_cat,
    marker=dict(
        size=scaled_size_cat,
        color='#d9534f',  # red
        line=dict(width=2, color='#333')
    ),
    name='Categoría'
)

# --- 4. Figure rendering ---
fig = go.Figure(data=edge_traces + [trace_channels, trace_categories],
              layout=go.Layout(
                # title.font replaces the deprecated `titlefont` shorthand.
                title=dict(
                    text='<br>Red Bipartita: Conexiones Canal-Categoría (Top 50 Canales, Top 20 Categorías)',
                    font=dict(size=16)
                ),
                showlegend=True,
                legend_title_text='Tipo de Nodo',
                hovermode='closest',
                margin=dict(b=5, l=5, r=5, t=40),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                plot_bgcolor='white',
                paper_bgcolor='#f9f9f9'
            ))
fig.show()