In [None]:
import pandas as pd

def process_imdb_data(input_file_path, output_file_path):
    """Normalize the ``directors`` column into one row per (title, director).

    Reads a tab-separated file that must contain ``tconst`` and ``directors``
    columns, where ``directors`` is a comma-separated list of ids or the
    ``\\N`` placeholder, and writes a gzip-compressed TSV with the columns
    ``tconst`` and ``director``.

    Args:
        input_file_path: Path of the TSV to read (pandas infers compression
            from the file extension).
        output_file_path: Path of the gzip-compressed TSV to write.

    Returns:
        None. Problems (unreadable input, empty input, missing columns,
        write failure) are reported with ``print`` and end the call early
        instead of raising.
    """
    try:
        df = pd.read_csv(input_file_path, delimiter='\t', low_memory=False)
    except Exception as e:
        print(f"Failed to read the TSV file: {e}")
        return
    if df.empty:
        print("The DataFrame is empty.")
        return
    if 'directors' not in df.columns:
        print("directors column is missing in the DataFrame.")
        return
    if 'tconst' not in df.columns:
        # Without the key column the output cannot be built; report it the
        # same way as a missing 'directors' column instead of raising KeyError.
        print("tconst column is missing in the DataFrame.")
        return

    # Keep only titles that actually list directors: drop the '\N' placeholder
    # and genuinely missing cells.
    has_directors = df['directors'].notna() & (df['directors'] != '\\N')
    credited = df.loc[has_directors, ['tconst', 'directors']]

    # explode() yields one row per list element directly. This avoids building
    # a frame as wide as the longest director list and does not depend on
    # stack() discarding the padding cells of that wide frame.
    # astype(str) keeps the .str accessor usable when the filtered column is
    # empty or was parsed as a non-string dtype.
    director_lists = credited['directors'].astype(str).str.split(',')
    df_normalized = (
        credited
        .assign(director=director_lists)
        .explode('director')
        [['tconst', 'director']]
    )

    try:
        df_normalized.to_csv(output_file_path, sep='\t', index=False, compression='gzip')
    except Exception as e:
        print(f"Failed to write the compressed TSV file: {e}")

# Run the directors normalization defined in this cell.
# The input path is absolute; the output path is relative, so the result is
# written to the current working directory.
path_to_input_file = '/content/title.crew.tsv.gz'
path_to_output_file = 'directors.tsv.gz'

process_imdb_data(path_to_input_file, path_to_output_file)


In [None]:
import pandas as pd

def process_imdb_data(input_file_path, output_file_path):
    """Normalize the ``writers`` column chunk by chunk into (tconst, writer) rows.

    The input TSV is read in chunks of 50000 rows so the whole file never has
    to fit in memory. Each chunk is reduced to one row per writer credit and
    appended to the output, with the header written once.

    Args:
        input_file_path: Path of the TSV to read; it must contain ``tconst``
            and ``writers`` columns, where ``writers`` is a comma-separated
            list of ids or the ``\\N`` placeholder.
        output_file_path: Path of the TSV to write. When the name ends in
            ``.gz`` the output is gzip-compressed so the content matches the
            extension; otherwise plain text is written.

    Returns:
        None. Any exception raised while processing is reported with
        ``print`` rather than propagated.
    """
    import gzip  # local import: this cell only imports pandas at the top

    chunk_size = 50000
    # Previously a '.gz' output name still received uncompressed text.
    opener = gzip.open if str(output_file_path).lower().endswith('.gz') else open
    try:
        # newline='' lets pandas control line endings itself (recommended by
        # the to_csv docs when passing an open file object).
        with opener(output_file_path, 'wt', newline='') as file_out:
            header = True

            for i, df in enumerate(pd.read_csv(input_file_path, delimiter='\t', low_memory=False, chunksize=chunk_size)):
                print(f"Processing chunk {i+1}")

                # Drop the '\N' placeholder and genuinely missing cells.
                has_writers = df['writers'].notna() & (df['writers'] != '\\N')
                credited = df.loc[has_writers, ['tconst', 'writers']]

                # explode() produces one row per writer without first building
                # a frame as wide as the longest writer list in the chunk.
                # astype(str) keeps .str usable on an empty / non-string chunk.
                writer_lists = credited['writers'].astype(str).str.split(',')
                df_normalized = (
                    credited
                    .assign(writer=writer_lists)
                    .explode('writer')
                    [['tconst', 'writer']]
                )

                # The handle is already positioned at the end of what was
                # written so far, so no append mode is needed here.
                df_normalized.to_csv(file_out, sep='\t', index=False, header=header)
                header = False

                print(f"Chunk {i+1} processed: {df_normalized.shape[0]} rows")
    except Exception as e:
        print(f"Failed during processing: {e}")


# Run the chunked writers normalization defined in this cell.
# The input path is absolute; the output path is relative, so the result is
# written to the current working directory.
path_to_input_file = '/content/title.crew.tsv.gz'
path_to_output_file = 'writers.tsv.gz'

process_imdb_data(path_to_input_file, path_to_output_file)


In [None]:
import pandas as pd

def process_imdb_data(input_file_path, output_file_path):
    """Normalize ``knownForTitles`` into one row per (person, title).

    Reads a tab-separated file that must contain ``nconst`` and
    ``knownForTitles`` columns, where ``knownForTitles`` is a comma-separated
    list of title ids or the ``\\N`` placeholder, and writes a
    gzip-compressed TSV with the columns ``nconst`` and ``knownForTitle``.

    Args:
        input_file_path: Path of the TSV to read (pandas infers compression
            from the file extension).
        output_file_path: Path of the gzip-compressed TSV to write.

    Returns:
        None. Problems (unreadable input, empty input, missing columns,
        write failure) are reported with ``print`` and end the call early
        instead of raising.
    """
    try:
        df = pd.read_csv(input_file_path, delimiter='\t', low_memory=False)
    except Exception as e:
        print(f"Failed to read the TSV file: {e}")
        return

    if df.empty:
        print("The DataFrame is empty.")
        return
    if 'knownForTitles' not in df.columns:
        print("knownForTitles column is missing in the DataFrame.")
        return
    if 'nconst' not in df.columns:
        # Report a missing key column the same way instead of raising KeyError.
        print("nconst column is missing in the DataFrame.")
        return

    # Drop the '\N' placeholder and genuinely missing cells.
    has_titles = df['knownForTitles'].notna() & (df['knownForTitles'] != '\\N')
    known = df.loc[has_titles, ['nconst', 'knownForTitles']]

    # The exploded column is named 'knownForTitle' (singular) so that it is
    # the column selected below; the earlier code named it 'knownForTitles'
    # and then selected 'knownForTitle', which raised KeyError on every run.
    # astype(str) keeps .str usable on an empty / non-string column.
    title_lists = known['knownForTitles'].astype(str).str.split(',')
    df_normalized = (
        known
        .assign(knownForTitle=title_lists)
        .explode('knownForTitle')
        [['nconst', 'knownForTitle']]
    )

    try:
        df_normalized.to_csv(output_file_path, sep='\t', index=False, compression='gzip')
    except Exception as e:
        print(f"Failed to write the compressed TSV file: {e}")

# Run the knownForTitles normalization defined in this cell.
# The input path is absolute; the output path is relative, so the result is
# written to the current working directory.
path_to_input_file = '/content/name.basics.tsv.gz'
path_to_output_file = 'knownForTitles.tsv.gz'

process_imdb_data(path_to_input_file, path_to_output_file)


In [None]:
import pandas as pd

def process_directors(input_file_path, output_file_path):
    """Write one ``(tconst, director)`` row for every director of every title.

    Reads a tab-separated file that must contain ``tconst`` and ``directors``
    columns, where ``directors`` is a comma-separated list of ids or the
    ``\\N`` placeholder, and writes the pairs as a gzip-compressed TSV.

    Args:
        input_file_path: Path of the TSV to read (pandas infers compression
            from the file extension).
        output_file_path: Path of the gzip-compressed TSV to write.

    Returns:
        None. Problems (unreadable input, empty input, missing columns,
        write failure) are reported with ``print`` and end the call early
        instead of raising.
    """
    try:
        df = pd.read_csv(input_file_path, delimiter='\t', low_memory=False)
    except Exception as e:
        print(f"Failed to read the TSV file: {e}")
        return

    if df.empty:
        print("The DataFrame is empty.")
        return
    if 'directors' not in df.columns:
        print("Directors column is missing in the DataFrame.")
        return
    if 'tconst' not in df.columns:
        # Report a missing key column the same way instead of raising KeyError.
        print("tconst column is missing in the DataFrame.")
        return

    # Titles without directors carry the '\N' placeholder (or, defensively,
    # a missing value); neither should produce an output row.
    directors = df['directors']
    credited = df.loc[directors.notna() & (directors != '\\N'), ['tconst', 'directors']]

    # Split each list and explode it to one row per director. Unlike
    # split(expand=True).stack(), this never materializes a frame as wide as
    # the longest director list and does not rely on stack() dropping the
    # padding cells. astype(str) keeps .str usable on an empty / non-string
    # column.
    df_normalized = credited.assign(
        director=credited['directors'].astype(str).str.split(',')
    ).explode('director')[['tconst', 'director']]

    try:
        df_normalized.to_csv(output_file_path, sep='\t', index=False, compression='gzip')
    except Exception as e:
        print(f"Failed to write the compressed TSV file: {e}")

# Run the directors normalization defined in this cell.
# NOTE(review): this reads the uncompressed '/content/title.crew.tsv', whereas
# the other cells read '/content/title.crew.tsv.gz' - confirm that file exists.
# NOTE(review): 'directors.tsv.gz' is also the output name used by the first
# cell of this notebook, so running both overwrites the earlier result.
path_to_input_file = '/content/title.crew.tsv'
path_to_output_file = 'directors.tsv.gz'

process_directors(path_to_input_file, path_to_output_file)


In [None]:
import pandas as pd
import gzip

def process_akas_data(input_file_path, output_file_path):
    """Copy a TSV to a gzip-compressed TSV, trimming two list-valued columns.

    The input is streamed in chunks of 50000 rows. In every chunk the
    ``types`` and ``attributes`` columns (when present) are reduced to the
    text before their first comma; all other columns are written unchanged.
    The header row is emitted with the first chunk only.

    Args:
        input_file_path: Path of the tab-separated file to read.
        output_file_path: Path of the gzip-compressed TSV to create.

    Returns:
        None. Any exception raised while processing is printed, not raised.
    """
    chunk_size = 50000  # rows per chunk; lower it if memory is tight

    def first_item(x):
        # Missing values pass through untouched; anything else keeps only
        # the part before the first comma.
        if pd.notna(x):
            return x.split(',')[0]
        return x

    try:
        # Text-mode gzip handle, so every chunk lands in one compressed stream.
        with gzip.open(output_file_path, 'wt') as file_out:
            reader = pd.read_csv(
                input_file_path,
                delimiter='\t',
                low_memory=False,
                chunksize=chunk_size,
            )
            for i, df in enumerate(reader):
                print(f"Processing chunk {i+1}")

                for column in ('types', 'attributes'):
                    if column in df.columns:
                        df[column] = df[column].apply(first_item)

                # Only the very first chunk carries the header row.
                df.to_csv(file_out, sep='\t', index=False, header=(i == 0))

                print(f"Chunk {i+1} processed: {df.shape[0]} rows")
    except Exception as e:
        print(f"Failed during processing: {e}")

# Paths for this run: the input is an absolute path, the output is relative
# and therefore written to the current working directory.
path_to_input_file = '/content/title.akas.tsv.gz'  # akas TSV to read
path_to_output_file = 'processed_akas.tsv.gz'  # gzip-compressed TSV to create

# Stream the akas file through process_akas_data, chunk by chunk.
process_akas_data(path_to_input_file, path_to_output_file)

In [None]:
import pandas as pd
import gzip
from multiprocessing import Pool

def load_valid_tconsts(basics_file_path):
    """Load the ``tconst`` column of a gzip-compressed TSV as a pandas Index.

    Args:
        basics_file_path: Path of a gzip-compressed, tab-separated file that
            has a ``tconst`` column.

    Returns:
        ``pd.Index`` holding the ``tconst`` values in file order.
    """
    # Only the id column is parsed; every other column is skipped by usecols.
    with gzip.open(basics_file_path, mode='rt') as handle:
        tconst_frame = pd.read_csv(
            handle,
            sep='\t',
            usecols=['tconst'],
            low_memory=False,
        )
    tconst_column = tconst_frame['tconst']
    return pd.Index(tconst_column)

def _normalize_writers_chunk(df_chunk, valid_tconsts):
    """Return one (tconst, writer) row per writer credit of a valid title."""
    writers = df_chunk['writers']
    # Keep rows whose title is known and that really list writers (neither
    # the '\N' placeholder nor a missing value).
    keep = df_chunk['tconst'].isin(valid_tconsts) & writers.notna() & (writers != '\\N')
    credited = df_chunk.loc[keep, ['tconst', 'writers']]
    # explode() gives one row per writer without first building a frame as
    # wide as the longest writer list; astype(str) keeps .str usable when the
    # filtered chunk is empty or was parsed as a non-string dtype.
    writer_lists = credited['writers'].astype(str).str.split(',')
    return credited.assign(writer=writer_lists).explode('writer')[['tconst', 'writer']]


def _write_chunks_in_order(normalized_chunks, file_out):
    """Write each normalized chunk to ``file_out``; header on the first only."""
    for i, df_normalized in enumerate(normalized_chunks):
        df_normalized.to_csv(file_out, sep='\t', index=False, header=(i == 0))
        print(f"Chunk processed: {df_normalized.shape[0]} rows")


def process_chunk(df_chunk, valid_tconsts, file_out_path, is_first_chunk):
    """Normalize one chunk and write it to a gzip-compressed TSV.

    Args:
        df_chunk: DataFrame with at least ``tconst`` and ``writers`` columns.
        valid_tconsts: Collection of title ids to keep (passed to ``isin``).
        file_out_path: Path of the gzip file to write.
        is_first_chunk: When true the file is created/truncated and the
            header row is written; otherwise the rows are appended.

    Returns:
        Number of (tconst, writer) rows written for this chunk.

    Note:
        Calls for the same ``file_out_path`` must not run concurrently:
        every call opens the file itself, so parallel calls can truncate or
        interleave each other's output.
    """
    df_normalized = _normalize_writers_chunk(df_chunk, valid_tconsts)

    mode = 'wt' if is_first_chunk else 'at'
    # newline='' lets pandas control line endings itself.
    with gzip.open(file_out_path, mode, newline='') as file_out:
        df_normalized.to_csv(file_out, sep='\t', index=False, header=is_first_chunk)
    return df_normalized.shape[0]


def process_writers(input_file_path, output_file_path, valid_tconsts,
                    chunk_size=50000, processes=4):
    """Normalize the ``writers`` column of a TSV into (tconst, writer) rows.

    The input is read in chunks. The per-chunk transformation runs in a
    process pool, while all writing happens here in the parent process,
    through a single file handle and in input order. Letting every worker
    open the output itself (as before) allowed the first chunk's truncating
    open to discard rows already appended by other workers and allowed
    concurrently appended data to interleave.

    Args:
        input_file_path: Path of the tab-separated file to read.
        output_file_path: Path of the gzip-compressed TSV to create.
        valid_tconsts: Collection of title ids to keep.
        chunk_size: Rows per chunk read from the input.
        processes: Number of worker processes; a value of 1 or less
            transforms the chunks in the calling process without a pool.

    Returns:
        None. A progress line is printed for every chunk written.
    """
    from functools import partial  # local import: not imported at cell level

    normalize = partial(_normalize_writers_chunk, valid_tconsts=valid_tconsts)

    with pd.read_csv(input_file_path, delimiter='\t', low_memory=False, chunksize=chunk_size) as reader:
        with gzip.open(output_file_path, 'wt', newline='') as file_out:
            if processes is not None and processes > 1:
                with Pool(processes=processes) as pool:
                    # imap yields results in submission order, so the output
                    # keeps the row order of the input file.
                    _write_chunks_in_order(pool.imap(normalize, reader), file_out)
            else:
                _write_chunks_in_order(map(normalize, reader), file_out)

# Paths for this run: both inputs are absolute paths, the output is relative
# and therefore written to the current working directory.
# NOTE(review): 'writers.tsv.gz' is also the output name used by the earlier
# chunked writers cell, so running both overwrites the earlier result.
path_to_basics_file = '/content/title.basics.tsv.gz'
path_to_input_file = '/content/title.crew.tsv.gz'
path_to_output_file = 'writers.tsv.gz'

# Load the title ids once; they are used to filter every crew chunk.
valid_tconsts = load_valid_tconsts(path_to_basics_file)

process_writers(path_to_input_file, path_to_output_file, valid_tconsts)
