In [1]:
import pandas as pd
import glob
import os

# Glob pattern for the input CSVs: every file in results/ whose name contains
# 'media_links'. Adjust the path according to your folder structure.
path = 'results/*media_links*.csv'

# Files that this notebook itself writes into results/. Their names also match
# `path`, so without this exclusion a re-run would read the notebook's own
# outputs back in as inputs and pollute the combined dataset.
generated_outputs = {'mp4_media_links.csv', 'media_links_00000-63720.csv'}

# glob returns matches in arbitrary order; sorting keeps the row order of the
# combined frame reproducible from one run to the next.
input_files = sorted(
    candidate for candidate in glob.glob(path)
    if os.path.basename(candidate) not in generated_outputs
)

# Fail early with a message that names the pattern, instead of letting
# pd.concat complain about an empty list further down.
if not input_files:
    raise ValueError(f"No input CSV files matched pattern {path!r}")

# One DataFrame per input file, each tagged with the file it came from
dfs = []
for file in input_files:
    df = pd.read_csv(file)
    df['source_file'] = os.path.basename(file)
    dfs.append(df)

# Stack all per-file frames into one, with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# Drop rows that are exact duplicates.
# NOTE(review): 'source_file' is part of the comparison, so identical records
# coming from two different files are NOT treated as duplicates here. Confirm
# whether cross-file duplicates should be removed as well.
combined_df = combined_df.drop_duplicates()


In [8]:
# Build the mp4 export in one pass: keep rows that have an mp4 link, switch to
# the export column names, and retain only the two exported columns.
mp4_df = (
    combined_df.loc[combined_df['mp4_url'].notna()]
    .rename(columns={
        'transcript_unique_id': 'transcript_id',
        'mp4_url': 'mp4_video_link',
    })
    .loc[:, ['transcript_id', 'mp4_video_link']]
)

# Write the export without the pandas index column
output_path_mp4 = 'results/mp4_media_links.csv'
mp4_df.to_csv(output_path_mp4, index=False)

# Report where the file went and how many rows it holds
print(f"Filtered dataset saved to: {output_path_mp4}")
print(f"Total rows in filtered dataset: {len(mp4_df)}")

# Preview the top of the exported data
print("\nFirst few rows of the filtered dataset:")
print(mp4_df.head(5))


Filtered dataset saved to: results/mp4_media_links.csv
Total rows in filtered dataset: 59186

First few rows of the filtered dataset:
         transcript_id                                     mp4_video_link
0  149_20190318T144449  https://vod.althingi.is/upptokur/old/20190318T...
1  149_20190318T144901  https://vod.althingi.is/upptokur/old/20190318T...
2  149_20190320T152530  https://vod.althingi.is/upptokur/old/20190320T...
3  149_20190320T165429  https://vod.althingi.is/upptokur/old/20190320T...
4  149_20190320T165846  https://vod.althingi.is/upptokur/old/20190320T...


In [10]:
# Keep only the rows of the combined data that have an mp4 link.
# NOTE(review): the output file name says "since_2018", but the only filter
# applied here is "mp4_url is present" -- no date restriction is made. Confirm
# whether a date filter is still required.
# NOTE(review): the "Danish" label should be checked against the data; the
# saved output of this cell shows vod.althingi.is URLs.
danish_mp4_df = combined_df[combined_df['mp4_url'].notna()]

# Rename columns to match the required export format
danish_mp4_df = danish_mp4_df.rename(columns={
    'transcript_unique_id': 'transcript_id',
    'mp4_url': 'mp4_video_link'
})

# Keep only the two exported columns, in the required order
danish_mp4_df = danish_mp4_df[['transcript_id', 'mp4_video_link']]

# Save the filtered DataFrame to a new CSV file.
# to_csv does not create missing directories, so make sure the target folder
# exists first; exist_ok=True makes this a no-op when it is already there.
output_path_danish_mp4 = 'links/danish_mp4_media_links_since_2018.csv'
os.makedirs(os.path.dirname(output_path_danish_mp4), exist_ok=True)
danish_mp4_df.to_csv(output_path_danish_mp4, index=False)

# Print some information about the result
print(f"Filtered Danish dataset saved to: {output_path_danish_mp4}")
print(f"Total rows in filtered Danish dataset: {len(danish_mp4_df)}")

# Display the first few rows of the filtered Danish dataset
print("\nFirst few rows of the filtered Danish dataset:")
print(danish_mp4_df.head())


Filtered Danish dataset saved to: links/danish_mp4_media_links_since_2018.csv
Total rows in filtered Danish dataset: 59186

First few rows of the filtered Danish dataset:
         transcript_id                                     mp4_video_link
0  149_20190318T144449  https://vod.althingi.is/upptokur/old/20190318T...
1  149_20190318T144901  https://vod.althingi.is/upptokur/old/20190318T...
2  149_20190320T152530  https://vod.althingi.is/upptokur/old/20190320T...
3  149_20190320T165429  https://vod.althingi.is/upptokur/old/20190320T...
4  149_20190320T165846  https://vod.althingi.is/upptokur/old/20190320T...


In [2]:
# Per-row flags: True where the corresponding link column is populated
mp4_present = combined_df['mp4_url'].notna()
mp3_present = combined_df['mp3_url'].notna()

# Summing the boolean flags gives the number of rows that have each link type
num_mp4_links = mp4_present.sum()
num_mp3_links = mp3_present.sum()

# Report both totals
print(f"Number of rows with an mp4 link: {num_mp4_links}")
print(f"Number of rows with an mp3 link: {num_mp3_links}")


Number of rows with an mp4 link: 59186
Number of rows with an mp3 link: 59350


In [None]:
# Persist the full combined dataset (index column omitted).
# NOTE: this file sits in results/ and has 'media_links' in its name, so it
# matches the 'results/*media_links*.csv' pattern used to find input files.
output_path = 'results/media_links_00000-63720.csv'
combined_df.to_csv(path_or_buf=output_path, index=False)

# Summary: number of source files, total row count, and destination
print(f"Combined {len(dfs)} files")
print(f"Total rows in combined dataset: {len(combined_df)}")
print(f"Output saved to: {output_path}")

# Preview the top of the combined data
print("\nFirst few rows of the combined dataset:")
print(combined_df.head(5))