V2 : added support for end tags, separators (., _) between words, and TV series identification

In [1]:
#!pip install ipywidgets

# 1- Imports

In [2]:
import os
import re
import pandas as pd
import tmdbsimple as tmdb
import time
from collections import defaultdict
from tqdm.notebook import tqdm

In [3]:
# Common list of end-of-file tags (quality, codec, release group) used to delimit the proper title.
RELEASE_TAGS = [
    r'1080p', r'720p', r'4k', r'2160p', r'bluray', r'webrip', r'webdl', r'hdtv', r'dvdrip', 
    r'x264', r'x265', r'h264', r'h265', r'ac3', r'eac3', r'dts', r'multi', r'truefrench', 
    r'vostfr', r'subfrench', r'fra', r'eng', r'hd', r'bd', r'remux'
]

# 2- Config

In [None]:
# ‚ö†Ô∏è Replace with your TMDb API Key
tmdb.API_KEY = 'd1caf3d9d5d31be93bace5476d9ccfd9'  # 'YOUR_TMDB_API_KEY_HERE'

# ‚ö†Ô∏è Replace with the actual path to your media folder
TARGET_FOLDER_PATH = r"E:\Films\#01 converted\newly converted" # r"C:\Your\Media\Folder"

In [None]:
def clean_name_and_extract_year(name):
    """
    Cleans the file name, extracts year/TV pattern, and isolates the title.
    """
    # 1. Initial cleanup: Remove extension
    name_no_ext = os.path.splitext(name)[0]

    # 2. Check for TV series markers (S01E01, S02, etc.)
    is_tv_pattern = bool(re.search(r'[sS]\d+[eE]\d+|\s[sS]\d+\s', name_no_ext))
    
    # 3. Standardize separators for easier parsing: replace . and _ with spaces
    # We ignore '.' that are part of file extensions before this step (already handled by os.path.splitext)
    working_name = name_no_ext.replace('.', ' ').replace('_', ' ').strip()
    
    year = None
    
    # 4. Extract Year: Search for a 4-digit number that might stand alone or be enclosed in () or []
    year_match = re.search(r'[\s\(](\d{4})[\s\)]?', working_name)
    if year_match:
        year = year_match.group(1)
        # Remove the year from the working name to clean the title
        working_name = working_name.replace(year_match.group(0), ' ').strip()
    
    # 5. Isolate the title: Find the first occurrence of a known release tag or TV tag (S0xE0x)
    
    # Combine TV episode/season pattern with release tags to find the end of the title
    tags_to_find = RELEASE_TAGS + [r'[sS]\d+[eE]\d+', r'[sS]\d+']
    tag_regex = re.compile(r'\b(' + '|'.join(tags_to_find) + r')\b', re.IGNORECASE)
    
    match = tag_regex.search(working_name)
    
    if match:
        # Title is the part of the string before the tag is found
        title_part = working_name[:match.start()].strip()
    else:
        title_part = working_name
    
    # Final cleanup: Remove any remaining excessive spaces or dashes
    cleaned_title = re.sub(r'\s+', ' ', title_part).strip()

    # The title should generally stop if we see "S01E01" (if not already handled)
    if is_tv_pattern:
        tv_match = re.search(r'([sS]\d+[eE]\d+)', cleaned_title)
        if tv_match:
             cleaned_title = cleaned_title[:tv_match.start()].strip()


    # Simple Fallback: if cleaning failed and the title is too long, try just the first few words
    if len(cleaned_title.split()) > 10:
         cleaned_title = ' '.join(cleaned_title.split()[:5])

    return cleaned_title, year, is_tv_pattern

def search_media_info(title, year=None, is_tv_pattern=False):
    """
    Searches TMDb for the title, prioritizing TV search if a pattern is detected.
    """
    if not title:
        return "Invalid Title", None

    director = "Not Specified"
    saga_name = None
    media_type = None
    search = tmdb.Search()

    try:
        if is_tv_pattern:
            # 1. PRIORITY 1a: Search for TV Series
            response = search.tv(query=title)
            media_type = 'tv'
        else:
            # 1. PRIORITY 1b: Search for Movie
            response = search.movie(query=title, year=year)
            media_type = 'movie'

        # 2. PRIORITY 2: Fallback search if the first one yields nothing
        if not response['results']:
            if media_type == 'movie':
                # Fallback from movie to TV
                response = search.tv(query=title)
                media_type = 'tv'
            else:
                # Fallback from TV to movie
                response = search.movie(query=title, year=year)
                media_type = 'movie'

            if not response['results']:
                return "Not Found", None
        
        # 3. Get information from the first result
        result = response['results'][0]
        media_id = result['id']
        
        # Ensure media_type is correctly set based on the final result
        media_type = result.get('media_type', media_type) 

        if media_type == 'movie':
            movie = tmdb.Movies(media_id)
            info = movie.info() 
            credits = movie.credits()

            # Find the Director
            for crew_member in credits.get('crew', []):
                if crew_member.get('job') == 'Director':
                    director = crew_member.get('name')
                    break
            
            # Find the Collection
            collection = info.get('belongs_to_collection')
            if collection:
                saga_name = collection.get('name')
        
        elif media_type == 'tv':
            director = "TV Series"

        return director, saga_name

    except Exception as e:
        # Capture all other errors (API, network issues)
        return f"API Search Error ({type(e).__name__})", None
        
    return "Not Found", None

### Fonction principale de scan

In [None]:
def scan_and_group_media(folder_path):
    
    films_by_director = defaultdict(list)
    films_by_saga = defaultdict(list)
    raw_results = []
    titles_processed = set()

    items_to_analyze = []
    for current_path, sub_folders, files in os.walk(folder_path):
        # We process folder names (usually cleaner) and potential video files
        if current_path == folder_path:
             items_to_analyze.extend(sub_folders)
        
        # Add potential video files
        for f in files:
            if os.path.splitext(f)[1].lower() in ['.mp4', '.mkv', '.avi', '.mov', '.wmv']:
                 items_to_analyze.append(f)

    # tqdm is for the progress bar
    unique_items_to_analyze = list(set(items_to_analyze))

    for name in tqdm(unique_items_to_analyze, desc="üîç Analyzing Titles and Searching API"):
        
        cleaned_title, year, is_tv_pattern = clean_name_and_extract_year(name)
        
        if cleaned_title and cleaned_title not in titles_processed:
            
            director, saga_name = search_media_info(cleaned_title, year, is_tv_pattern)
            
            # raw results
            raw_results.append({
                'Original Name': name,
                'Cleaned Title': cleaned_title,
                'Year': year if year else '-',
                'Director': director,
                'Saga': saga_name if saga_name else '-',
                'Status': 'OK' # Default status
            })
            
            last_result = raw_results[-1]

            # Setup Manual Check
            if last_result['Director'] == "TV Series":
                last_result['Status'] = 'TV Series'
            elif last_result['Director'] in ["Not Found", "Not Specified"] or "API Search Error" in last_result['Director']:
                last_result['Status'] = 'Manual Check'


            # Grouping Logic (only for identified movies)
            if last_result['Status'] == 'OK':
                films_by_director[director].append(cleaned_title)
                if saga_name:
                    films_by_saga[saga_name].append(cleaned_title)
                
            titles_processed.add(cleaned_title)
            
            time.sleep(0.3) 

    return pd.DataFrame(raw_results), films_by_director, films_by_saga

### Main

In [None]:
if os.path.exists(TARGET_FOLDER_PATH):
    print(f"üé¨ Starting scan and online search in: {TARGET_FOLDER_PATH}\n")
    
    df_raw, grouped_by_director, grouped_by_saga = scan_and_group_media(TARGET_FOLDER_PATH)
    
    
    # --- RESULT 4: MANUAL CHECK ---
    print("\n" + "="*70)
    print("‚ùå RESULT 4: MEDIA REQUIRING MANUAL CHECK")
    print("="*70)

    df_manual = df_raw[df_raw['Status'] == 'Manual Check']
    
    if not df_manual.empty:
        print(f"‚ö†Ô∏è {len(df_manual)} media files require manual verification:")
        display(df_manual[['Original Name', 'Cleaned Title', 'Director', 'Saga', 'Status']])
    else:
        print("No media requires immediate manual verification.")


    # --- RESULT 5: TV SERIES LIST ---
    print("\n" + "="*70)
    print("üì∫ RESULT 5: TV SERIES IDENTIFIED")
    print("="*70)
    df_tv = df_raw[df_raw['Status'] == 'TV Series']
    if not df_tv.empty:
        # We only display basic titles
        unique_tv_series = df_tv['Cleaned Title'].unique()
        print(f"‚úÖ {len(unique_tv_series)} unique TV series found:")
        print("\n".join([f"    - {title}" for title in unique_tv_series]))
    else:
        print("No TV series found (or identified as TV Series).")
        
    # --- RESULT 1: Grouping by Collection ---
    print("\n" + "="*70)
    print("‚úÖ RESULT 1: GROUPED BY COLLECTION (More than 1 film)")
    print("="*70)
    
    sagas_multiples = {
        saga: films 
        for saga, films in grouped_by_saga.items() 
        if len(films) > 1
    }
    
    if sagas_multiples:
        print(f"üìΩÔ∏è  {len(sagas_multiples)} Saga(s) identified:")
        for saga, films in sagas_multiples.items():
            print(f"--- **{saga}** ({len(films)} films) ---")
            for film in sorted(films):
                print(f"    - {film}")
            print("\n")
    else:
        print("No sagas containing more than one film were found.")

    # --- RESULT 2: Grouping by Director (Remaining Unique Films) ---
    print("\n" + "="*70)
    print("‚úÖ RESULT 2: GROUPED BY DIRECTOR (Unique Films)")
    print("="*70)

    directors_multiples = {
        director: films 
        for director, films in grouped_by_director.items() 
        if len(films) > 1
    }

    # Filter out films already grouped by Collection
    films_in_sagas = set()
    for films in sagas_multiples.values():
        films_in_sagas.update(films)

    directors_unique = {}
    for director, films in directors_multiples.items():
        remaining_films = [f for f in films if f not in films_in_sagas]
        if len(remaining_films) > 1:
            directors_unique[director] = remaining_films

    if directors_unique:
        print(f"üìΩÔ∏è  {len(directors_unique)} Director(s) with multiple films (excluding sagas above):")
        for director, films in directors_unique.items():
            print(f"--- **{director}** ({len(films)} films) ---")
            for film in sorted(films):
                print(f"    - {film}")
            print("\n")
    else:
        print("No director has directed more than one film not classified in a saga.")
            
    # --- RESULT 3: Display Raw Data ---
    print("\n" + "="*70)
    print("üìã RAW ANALYSIS DETAILS (50 first)")
    print("="*70)
    display(df_raw.head(50))

else:
    print(f"‚ùå Error: The specified path does not exist: {TARGET_FOLDER_PATH}")

üé¨ Starting scan and online search in: E:\Films\#01 converted\newly converted



üîç Analyzing Titles and Searching API:   0%|          | 0/376 [00:00<?, ?it/s]


‚ùå RESULT 4: MEDIA REQUIRING MANUAL CHECK
‚ö†Ô∏è 6 media files require manual verification:


Unnamed: 0,Original Name,Cleaned Title,Director,Saga,Status
38,#02 Shows,#02 Shows,Not Found,-,Manual Check
54,Sauver ou perir (2017).mp4,Sauver ou perir,Not Found,-,Manual Check
116,Le Chant Du Loup (2018).mp4,Le Chant Du Loup,Not Found,-,Manual Check
124,Le Proces Goldman (2023) 10bit.mp4,Le Proces Goldman 10bit,Not Found,-,Manual Check
166,The Covenant (2023).mp4,The Covenant,Not Specified,-,Manual Check
214,The Aviator (2004) v2.mp4,The Aviator v2,Not Found,-,Manual Check



üì∫ RESULT 5: TV SERIES IDENTIFIED
‚úÖ 7 unique TV series found:
    - House of the Dragon
    - Game of Thrones
    - The Pitt
    - Wednesday
    - Game Of Thrones
    - Euphoria
    - Stranger Things

‚úÖ RESULT 1: GROUPED BY SAGA/COLLECTION (More than 1 film)
üìΩÔ∏è  7 Saga(s) identified:
--- **Star Wars Collection** (9 films) ---
    - Star Wars Episode I The Phantom Menace
    - Star Wars Episode II Attack Of The Clones
    - Star Wars Episode III Revenge Of The Sith
    - Star Wars Episode IV A New Hope
    - Star Wars Episode Ix The Rise Of Skywalker
    - Star Wars Episode V The Empire Strikes Back
    - Star Wars Episode VI Return Of The Jedi
    - Star Wars Episode VII The Force Awakens
    - Star Wars Episode Viii The Last Jedi


--- **The Incredibles Collection** (2 films) ---
    - Les Indestructibles
    - Les Indestructibles 2


--- **Cars Collection** (3 films) ---
    - Cars
    - Cars 2
    - Cars 3


--- **Mad Max Collection** (3 films) ---
    - Mad Max
    - Ma

Unnamed: 0,Original Name,Cleaned Title,Year,Director,Saga,Status
0,Citizen Kane (1941).mp4,Citizen Kane,1941,Orson Welles,-,OK
1,Star Wars Episode V The Empire Strikes Back (1...,Star Wars Episode V The Empire Strikes Back,1980,Irvin Kershner,Star Wars Collection,OK
2,La Vie d'Ad√®le (2013) x265.mp4,La Vie d'Ad√®le,2013,Abdellatif Kechiche,-,OK
3,House of the Dragon S02E05.mp4,House of the Dragon,-,TV Series,-,TV Series
4,Nightcrawler (2014) x264 v2.mp4,Nightcrawler,2014,Dan Gilroy,-,OK
5,Game of Thrones S04E03.mp4,Game of Thrones,-,TV Series,-,TV Series
6,Ben-Hur (1959).mkv,Ben-Hur,1959,William Wyler,-,OK
7,The Big Short (2015) x265.mp4,The Big Short,2015,Adam McKay,-,OK
8,The Machinist (2004).mp4,The Machinist,2004,Brad Anderson,-,OK
9,Fantastic Mr Fox (2009).mp4,Fantastic Mr Fox,2009,Wes Anderson,-,OK
