In [2]:
import pandas as pd
import json
import gzip
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# CONVERTIR A PANDAS DF

In [44]:
# Locate the project's data directory, two levels above the working directory.
base_dir = Path().resolve().parents[1]
data_dir = base_dir / 'data'
file = data_dir / 'raw' / 'youtube_statistics.jsonl.gz'

# Every line of the gzip-compressed file is parsed as one JSON document.
with gzip.open(file, "rt", encoding="utf-8") as f:
    rows = [json.loads(line) for line in f]

df = pd.DataFrame(rows)
df.head()

Unnamed: 0,id,name,video_statistics
0,918450,Dumb Little Creatures,"[{'id': 'EZOQb8jQZmc', 'video_statistics': {'v..."
1,908950,Glow Chess,[]
2,3565650,Drone Simulator VR,[]
3,3829660,Catgirl Simulator,"[{'id': '2kyrTLNzJv8', 'video_statistics': {'v..."
4,4015080,Beat Shapes,[]


In [45]:
# Spread each game's list of videos over integer-labelled columns (one column
# per list position) and attach those columns to the frame.
video_columns = df["video_statistics"].apply(pd.Series)
df = df.join(video_columns)
df.head(2)

Unnamed: 0,id,name,video_statistics,0,1,2,3,4,5,6,...,15,16,17,18,19,20,21,22,23,24
0,918450,Dumb Little Creatures,"[{'id': 'EZOQb8jQZmc', 'video_statistics': {'v...","{'id': 'EZOQb8jQZmc', 'video_statistics': {'vi...","{'id': 'KdS95Dx_K4s', 'video_statistics': {'vi...",,,,,,...,,,,,,,,,,
1,908950,Glow Chess,[],,,,,,,,...,,,,,,,,,,


In [46]:
# Discard every column from position 7 onwards, then the raw list column.
df = df.drop(columns=df.columns[7:]).drop(columns='video_statistics')
df.head(2)

Unnamed: 0,id,name,0,1,2,3
0,918450,Dumb Little Creatures,"{'id': 'EZOQb8jQZmc', 'video_statistics': {'vi...","{'id': 'KdS95Dx_K4s', 'video_statistics': {'vi...",,
1,908950,Glow Chess,,,,


Sacar columnas por cada video

In [47]:
# Labels of the per-video columns of df that are flattened below.
video_cols = [0, 1, 2, 3]

def flatten_video_dict(d, prefix):
    """
    Flatten a nested dictionary into a Series with prefixed labels.

    Nested keys are joined by ``pd.json_normalize`` and every resulting label
    is prepended with ``"{prefix}_"``. A missing value (``pd.isna(d)`` is true)
    yields an empty object-dtype Series.
    """
    if pd.isna(d):
        return pd.Series(dtype=object)

    # Prefix the flattened columns first, then take the single resulting row.
    return pd.json_normalize(d).add_prefix(f"{prefix}_").iloc[0]


# Build one flattened frame per video column.
dfs = [
    df[col].apply(flatten_video_dict, prefix=f"video_{col}")
    for col in video_cols
]

# Attach all flattened frames to the original frame, side by side.
df_final = pd.concat([df, *dfs], axis=1)
df_final.head(2)

Unnamed: 0,id,name,0,1,2,3,video_0_id,video_0_video_statistics.viewCount,video_0_video_statistics.likeCount,video_0_video_statistics.favoriteCount,...,video_2_id,video_2_video_statistics.viewCount,video_2_video_statistics.likeCount,video_2_video_statistics.favoriteCount,video_2_video_statistics.commentCount,video_3_id,video_3_video_statistics.viewCount,video_3_video_statistics.likeCount,video_3_video_statistics.favoriteCount,video_3_video_statistics.commentCount
0,918450,Dumb Little Creatures,"{'id': 'EZOQb8jQZmc', 'video_statistics': {'vi...","{'id': 'KdS95Dx_K4s', 'video_statistics': {'vi...",,,EZOQb8jQZmc,179.0,4.0,0.0,...,,,,,,,,,,
1,908950,Glow Chess,,,,,,,,,...,,,,,,,,,,


In [48]:
# Remove the raw per-video dict columns and the video ids, then replace every
# missing value with 0.
df_final = df_final.drop(
    columns=[0, 1, 2, 3, 'video_0_id', 'video_1_id', 'video_2_id', 'video_3_id']
).fillna(0)
df_final

Unnamed: 0,id,name,video_0_video_statistics.viewCount,video_0_video_statistics.likeCount,video_0_video_statistics.favoriteCount,video_0_video_statistics.commentCount,video_1_video_statistics.viewCount,video_1_video_statistics.likeCount,video_1_video_statistics.favoriteCount,video_1_video_statistics.commentCount,video_2_video_statistics.viewCount,video_2_video_statistics.likeCount,video_2_video_statistics.favoriteCount,video_2_video_statistics.commentCount,video_3_video_statistics.viewCount,video_3_video_statistics.likeCount,video_3_video_statistics.favoriteCount,video_3_video_statistics.commentCount
0,918450,Dumb Little Creatures,179,4,0,0,12,0,0,0,0,0,0,0,0,0,0,0
1,908950,Glow Chess,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3565650,Drone Simulator VR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3829660,Catgirl Simulator,357044,8745,0,325,70122,1780,0,173,67772,2002,0,163,49552,1560,0,89
4,4015080,Beat Shapes,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3330,3565250,Crossing Chains,163,2,0,0,15,2,0,0,0,0,0,0,0,0,0,0
3331,2336550,Chinese Chess Party (Xiangqi),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3332,440750,Story of a Cube,1355,6,0,1,542,10,0,0,391,14,0,3,387,6,0,2
3333,431510,Mystic Destinies: Serendipity of Aeons,210,1,0,0,164,2,0,0,137,0,0,0,86,0,0,0


In [50]:
# Every column whose label mentions "video_statistics" is cast to an integer.
cols_to_int = [c for c in df_final.columns if "video_statistics" in c]

# Strings become numbers (unparseable values turn into missing values), then
# the result is stored with pandas' nullable integer dtype.
numeric_stats = df_final[cols_to_int].apply(pd.to_numeric, errors="coerce")
df_final[cols_to_int] = numeric_stats.astype("Int64")

df_final.dtypes

id                                          str
name                                        str
video_0_video_statistics.viewCount        Int64
video_0_video_statistics.likeCount        Int64
video_0_video_statistics.favoriteCount    Int64
video_0_video_statistics.commentCount     Int64
video_1_video_statistics.viewCount        Int64
video_1_video_statistics.likeCount        Int64
video_1_video_statistics.favoriteCount    Int64
video_1_video_statistics.commentCount     Int64
video_2_video_statistics.viewCount        Int64
video_2_video_statistics.likeCount        Int64
video_2_video_statistics.favoriteCount    Int64
video_2_video_statistics.commentCount     Int64
video_3_video_statistics.viewCount        Int64
video_3_video_statistics.likeCount        Int64
video_3_video_statistics.favoriteCount    Int64
video_3_video_statistics.commentCount     Int64
dtype: object

In [51]:
# Persist the cleaned statistics under the project data directory. The path is
# built from data_dir (defined where the raw file is loaded) rather than a
# separate "../.." string, so reading and writing share one root, and the
# target folder is created when it does not exist yet.
processed_file = data_dir / 'processed' / 'yt_stats.parquet'
processed_file.parent.mkdir(parents=True, exist_ok=True)
df_final.to_parquet(processed_file)

In [None]:
# Order the games by the view count of video 0, highest first.
df_final.sort_values(by='video_0_video_statistics.viewCount', ascending=False)

Unnamed: 0,id,name,video_0_video_statistics.viewCount,video_0_video_statistics.likeCount,video_0_video_statistics.favoriteCount,video_0_video_statistics.commentCount,video_1_video_statistics.viewCount,video_1_video_statistics.likeCount,video_1_video_statistics.favoriteCount,video_1_video_statistics.commentCount,video_2_video_statistics.viewCount,video_2_video_statistics.likeCount,video_2_video_statistics.favoriteCount,video_2_video_statistics.commentCount,video_3_video_statistics.viewCount,video_3_video_statistics.likeCount,video_3_video_statistics.favoriteCount,video_3_video_statistics.commentCount
3130,1670870,MADiSON,655229413,6371358,0,256255,241341477,2919422,0,161907,114903335,1562352,0,32857,0,0,0,0
1801,1407200,World of Tanks,268278482,1541326,0,19953,13929131,66882,0,720,13440666,133130,0,6193,8682067,70529,0,1917
2511,1092710,Hello Neighbor Alpha 1,250673092,1231418,0,0,170117648,746529,0,0,123749309,604113,0,0,109836740,515762,0,0
1736,1044180,Planet,168750022,569832,0,10848,0,0,0,0,0,0,0,0,0,0,0,0
1186,733430,The Escape,113999758,78637,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3309,1696310,Starveling Way,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3328,2810370,Bal,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3329,1515040,Bloks,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3331,2336550,Chinese Chess Party (Xiangqi),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# PCA

In [None]:
# Per-video PCA: reduce each video's statistics columns to two components.
# PCA and StandardScaler are imported in the notebook's first cell.
video_cols = [0, 1, 2, 3]

for i in video_cols:
    # Select only the raw statistics columns of video i. Matching the full
    # "video_{i}_video_statistics." prefix (instead of the bare "video_{i}_")
    # keeps the derived video_{i}_pca_* columns out of the feature matrix, so
    # re-running this cell does not feed earlier PCA output back into the PCA.
    cols = [
        c for c in df_final.columns
        if isinstance(c, str) and c.startswith(f"video_{i}_video_statistics.")
    ]

    # Keep numeric columns only and treat missing values as zero.
    X = df_final[cols].select_dtypes(include="number").fillna(0)

    # Two components require at least two numeric features; otherwise skip.
    if X.shape[1] < 2:
        print(f"Video {i}: no hay suficientes features numéricas para PCA")
        continue

    # Standardize the features before the decomposition.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Two-component PCA with a fixed random_state.
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X_scaled)

    # Store both components as new columns of the final frame.
    df_final[f"video_{i}_pca_1"] = X_pca[:, 0]
    df_final[f"video_{i}_pca_2"] = X_pca[:, 1]

df_final

Unnamed: 0,id,name,video_0_id,video_0_video_statistics.viewCount,video_0_video_statistics.likeCount,video_0_video_statistics.favoriteCount,video_0_video_statistics.commentCount,video_1_id,video_1_video_statistics.viewCount,video_1_video_statistics.likeCount,...,video_3_video_statistics.favoriteCount,video_3_video_statistics.commentCount,video_0_pca_1,video_0_pca_2,video_1_pca_1,video_1_pca_2,video_2_pca_1,video_2_pca_2,video_3_pca_1,video_3_pca_2
0,918450,Dumb Little Creatures,EZOQb8jQZmc,179,4,0,0,KdS95Dx_K4s,12,0,...,0,0,-0.204810,-0.017111,-0.171703,0.008365,-0.184052,-0.040966,-0.176583,-0.036368
1,908950,Glow Chess,0,0,0,0,0,0,0,0,...,0,0,-0.204833,-0.017102,-0.171704,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
2,3565650,Drone Simulator VR,0,0,0,0,0,0,0,0,...,0,0,-0.204833,-0.017102,-0.171704,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
3,3829660,Catgirl Simulator,2kyrTLNzJv8,357044,8745,0,325,ZeE_sigOaPY,70122,1780,...,0,89,-0.124715,0.003800,-0.122401,-0.016417,-0.071304,0.038720,-0.082119,0.003193
4,4015080,Beat Shapes,0,0,0,0,0,0,0,0,...,0,0,-0.204833,-0.017102,-0.171704,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3330,3565250,Crossing Chains,1mZqSPsy-sY,163,2,0,0,iRYQs0V5EcI,15,2,...,0,0,-0.204819,-0.017110,-0.171685,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
3331,2336550,Chinese Chess Party (Xiangqi),0,0,0,0,0,0,0,0,...,0,0,-0.204833,-0.017102,-0.171704,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
3332,440750,Story of a Cube,_Eq8d7CJCMk,1355,6,0,1,JLopkgphCY0,542,10,...,0,2,-0.204665,-0.017046,-0.171565,0.008420,-0.182505,-0.039284,-0.175491,-0.035162
3333,431510,Mystic Destinies: Serendipity of Aeons,pBACG7O1QmM,210,1,0,0,hWkAda9Ue6U,164,2,...,0,0,-0.204821,-0.017112,-0.171671,0.008381,-0.184030,-0.040990,-0.176563,-0.036388


In [42]:

# Drop the raw per-video columns by label, leaving id, name and the derived
# columns. A positional slice such as columns[2:22] depends on how many raw
# columns happen to be present: with the 16 statistics columns left after the
# video ids are dropped, it would also remove the first four PCA columns.
raw_video_cols = [
    c for c in df_final.columns
    if isinstance(c, str)
    and c.startswith("video_")
    and ("video_statistics" in c or c.endswith("_id"))
]
df_final = df_final.drop(columns=raw_video_cols)
df_final

Unnamed: 0,id,name,video_0_pca_1,video_0_pca_2,video_1_pca_1,video_1_pca_2,video_2_pca_1,video_2_pca_2,video_3_pca_1,video_3_pca_2
0,918450,Dumb Little Creatures,-0.204810,-0.017111,-0.171703,0.008365,-0.184052,-0.040966,-0.176583,-0.036368
1,908950,Glow Chess,-0.204833,-0.017102,-0.171704,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
2,3565650,Drone Simulator VR,-0.204833,-0.017102,-0.171704,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
3,3829660,Catgirl Simulator,-0.124715,0.003800,-0.122401,-0.016417,-0.071304,0.038720,-0.082119,0.003193
4,4015080,Beat Shapes,-0.204833,-0.017102,-0.171704,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
...,...,...,...,...,...,...,...,...,...,...
3330,3565250,Crossing Chains,-0.204819,-0.017110,-0.171685,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
3331,2336550,Chinese Chess Party (Xiangqi),-0.204833,-0.017102,-0.171704,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
3332,440750,Story of a Cube,-0.204665,-0.017046,-0.171565,0.008420,-0.182505,-0.039284,-0.175491,-0.035162
3333,431510,Mystic Destinies: Serendipity of Aeons,-0.204821,-0.017112,-0.171671,0.008381,-0.184030,-0.040990,-0.176563,-0.036388


In [43]:
# Map each video's two PCA columns to their score names.
rename_map = {}
for i in range(4):
    rename_map.update({
        f"video_{i}_pca_1": f"video_{i}_popularity_score",
        f"video_{i}_pca_2": f"video_{i}_engagement_score",
    })

df_final = df_final.rename(columns=rename_map)
df_final

Unnamed: 0,id,name,video_0_popularity_score,video_0_engagement_score,video_1_popularity_score,video_1_engagement_score,video_2_popularity_score,video_2_engagement_score,video_3_popularity_score,video_3_engagement_score
0,918450,Dumb Little Creatures,-0.204810,-0.017111,-0.171703,0.008365,-0.184052,-0.040966,-0.176583,-0.036368
1,908950,Glow Chess,-0.204833,-0.017102,-0.171704,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
2,3565650,Drone Simulator VR,-0.204833,-0.017102,-0.171704,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
3,3829660,Catgirl Simulator,-0.124715,0.003800,-0.122401,-0.016417,-0.071304,0.038720,-0.082119,0.003193
4,4015080,Beat Shapes,-0.204833,-0.017102,-0.171704,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
...,...,...,...,...,...,...,...,...,...,...
3330,3565250,Crossing Chains,-0.204819,-0.017110,-0.171685,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
3331,2336550,Chinese Chess Party (Xiangqi),-0.204833,-0.017102,-0.171704,0.008363,-0.184052,-0.040966,-0.176583,-0.036368
3332,440750,Story of a Cube,-0.204665,-0.017046,-0.171565,0.008420,-0.182505,-0.039284,-0.175491,-0.035162
3333,431510,Mystic Destinies: Serendipity of Aeons,-0.204821,-0.017112,-0.171671,0.008381,-0.184030,-0.040990,-0.176563,-0.036388


# PRUEBA DE PCA

In [15]:
# Statistics columns used as PCA inputs.
# NOTE(review): assumes df already exposes flattened "video_statistics.*"
# columns; the df built by the earlier cells only shows id, name and 0-3 --
# confirm which frame this cell is meant to run on.
features = [
    "video_statistics.viewCount",
    "video_statistics.likeCount",
    "video_statistics.favoriteCount",
    "video_statistics.commentCount",
]

# Standardize the selected columns: fit the scaler, then apply it.
X = df.loc[:, features]
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled

array([[-0.10628278, -0.11882428,  0.        , -0.129773  ],
       [-0.10629452, -0.11885225,  0.        , -0.129773  ],
       [-0.10629452, -0.11885225,  0.        , -0.129773  ],
       ...,
       [-0.10620564, -0.1188103 ,  0.        , -0.12960837],
       [-0.10628075, -0.11884526,  0.        , -0.129773  ],
       [-0.10629452, -0.11885225,  0.        , -0.129773  ]],
      shape=(3335, 4))

In [16]:

# Fit a PCA that keeps every component, to inspect how the variance is split.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Fraction of the total variance explained by each component.
explained_variance = pca.explained_variance_ratio_
explained_variance

array([0.89965986, 0.08223429, 0.01810585, 0.        ])

In [17]:
# Refit with two components and tabulate the component weights per feature.
pca = PCA(n_components=2)
pca.fit(X_scaled)
# Rows are the two principal components, columns the original features.
loadings = pd.DataFrame(
    pca.components_,
    columns=features,
    index=["PC1", "PC2"]
)

loadings

Unnamed: 0,video_statistics.viewCount,video_statistics.likeCount,video_statistics.favoriteCount,video_statistics.commentCount
PC1,0.568042,0.597947,-0.0,0.565497
PC2,-0.692746,-0.023579,0.0,0.720796


In [18]:
# Project the scaled features onto two components.
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)

# Store each component as a new column of df.
df["PC1"] = X_pca_2d[:, 0]
df["PC2"] = X_pca_2d[:, 1]
df

Unnamed: 0,id_x,name,video_statistics.viewCount,video_statistics.likeCount,video_statistics.favoriteCount,video_statistics.commentCount,PC1,PC2
0,918450,Dumb Little Creatures,179,4,0,0,-0.204810,-0.017111
1,908950,Glow Chess,0,0,0,0,-0.204833,-0.017102
2,3565650,Drone Simulator VR,0,0,0,0,-0.204833,-0.017102
3,3829660,Catgirl Simulator,357044,8745,0,325,-0.124715,0.003800
4,4015080,Beat Shapes,0,0,0,0,-0.204833,-0.017102
...,...,...,...,...,...,...,...,...
3330,3565250,Crossing Chains,163,2,0,0,-0.204819,-0.017110
3331,2336550,Chinese Chess Party (Xiangqi),0,0,0,0,-0.204833,-0.017102
3332,440750,Story of a Cube,1355,6,0,1,-0.204665,-0.017046
3333,431510,Mystic Destinies: Serendipity of Aeons,210,1,0,0,-0.204821,-0.017112
