99 "regexp"
1010 "strings"
1111 "time"
12+ "unicode/utf8"
1213
1314 "github.com/OpenListTeam/OpenList/v4/internal/model"
1415)
@@ -19,6 +20,52 @@ var yearRegexp = regexp.MustCompile(`\b((?:19|20)\d{2})\b`)
// chineseRegexp matches a single Han (Chinese) character; it is used to
// detect text fragments that contain Chinese.
var chineseRegexp = regexp.MustCompile(`[\p{Han}]`)
2122
// noiseTokenRegexp matches release "noise" tokens (audio/subtitle language
// tags, source, codec, bit depth, resolution, edition markers) that commonly
// pollute the tail of a title. Matching is case-insensitive.
//
// NOTE(review): tokens are matched without word boundaries, so a token that
// happens to occur inside a longer word (e.g. "aac" in "Isaac") is stripped
// as well.
var noiseTokenRegexp = regexp.MustCompile(`(?i)(双语字幕|中字|国英|国粤|粤语|国语|英语|日语|韩语|HDTV|HR-HDTV|BluRay|BDRip|WEB-?DL|HDRip|DVDRip|REMUX|x264|x265|h264|h265|HEVC|AVC|AAC|AC3|DTS|FLAC|10bit|8bit|HDR|SDR|4K|2160P|1080P|720P|480P|完整版|未删减版)`)

// cnNumMap is a simple mapping from Chinese numerals to Arabic digits, used
// to normalize titles such as 「钢铁侠三」 => 「钢铁侠3」. Note that "十"
// maps to the two-digit string "10".
var cnNumMap = map[string]string{
	"〇": "0", "零": "0", "一": "1", "二": "2", "三": "3", "四": "4",
	"五": "5", "六": "6", "七": "7", "八": "8", "九": "9", "十": "10",
}

// titleBracketRegexp matches one non-nested bracketed segment, including the
// brackets themselves. Supported openers/closers are ASCII ( ) and [ ],
// full-width parentheses (U+FF08 / U+FF09) and 【 】. The full-width
// parentheses are written as \x{...} escapes so the pattern survives tools
// that normalize full-width punctuation to ASCII.
var titleBracketRegexp = regexp.MustCompile(`[\(\x{FF08}\[【][^\)\x{FF09}\]】]*[\)\x{FF09}\]】]`)

// titleSeparatorReplacer turns the separators ".", "_", "-" and "+" into
// spaces. It is built once at package scope instead of on every call.
var titleSeparatorReplacer = strings.NewReplacer(".", " ", "_", " ", "-", " ", "+", " ")

// normalizeTitle normalizes a title before fuzzy matching:
//   - removes bracketed segments together with their brackets,
//   - removes version/codec noise tokens (see noiseTokenRegexp),
//   - replaces ".", "_", "-" and "+" with spaces,
//   - collapses runs of whitespace into one space and trims both ends.
//
// An empty input is returned unchanged. The result may be empty when the
// input consists only of brackets, noise tokens and separators.
func normalizeTitle(s string) string {
	if s == "" {
		return s
	}
	// Brackets go first so that noise tokens inside them disappear with them.
	s = titleBracketRegexp.ReplaceAllString(s, " ")
	// Noise tokens are removed before separators are rewritten because some
	// tokens (e.g. "WEB-DL", "HR-HDTV") contain a "-" themselves.
	s = noiseTokenRegexp.ReplaceAllString(s, " ")
	s = titleSeparatorReplacer.Replace(s)
	// strings.Fields splits on every Unicode white space character, so the
	// ideographic space (U+3000) common in Chinese file names is collapsed
	// too; joining the fields also trims leading and trailing white space.
	return strings.Join(strings.Fields(s), " ")
}
52+
53+ // cnNumToArabic 将标题中的中文数字归一化为阿拉伯数字(仅做轻量处理)
54+ func cnNumToArabic (s string ) string {
55+ if s == "" {
56+ return s
57+ }
58+ var b strings.Builder
59+ for _ , r := range s {
60+ if v , ok := cnNumMap [string (r )]; ok {
61+ b .WriteString (v )
62+ } else {
63+ b .WriteRune (r )
64+ }
65+ }
66+ return b .String ()
67+ }
68+
2269// parsedVideoTitle 解析后的视频标题信息
2370type parsedVideoTitle struct {
2471 EnglishTitle string // 英文标题(第一个中文片段之前、年份之前的部分)
@@ -165,13 +212,32 @@ type tmdbMovieDetail struct {
165212}
166213
167214// doTMDBSearch 执行一次TMDB搜索请求
168- func (s * TMDBScraper ) doTMDBSearch (query , year string ) (* tmdbSearchResult , error ) {
169- searchURL := fmt .Sprintf ("%s/search/multi?api_key=%s&query=%s&language=zh-CN&search_type=ngram" ,
170- s .BaseURL , s .APIKey , url .QueryEscape (query ))
215+ // endpoint 取值:multi / movie / tv
216+ // language 取值:zh-CN / en-US / 空(不传 language,TMDB 会按用户语言或英文)
217+ // 注意:search_type=ngram 是已废弃参数,不再使用;TMDB 默认即支持 substring 匹配
218+ func (s * TMDBScraper ) doTMDBSearch (endpoint , query , year , language string ) (* tmdbSearchResult , error ) {
219+ if endpoint == "" {
220+ endpoint = "multi"
221+ }
222+ params := url.Values {}
223+ params .Set ("api_key" , s .APIKey )
224+ params .Set ("query" , query )
225+ params .Set ("include_adult" , "true" )
226+ if language != "" {
227+ params .Set ("language" , language )
228+ }
171229 if year != "" {
172- searchURL += "&year=" + year
230+ // movie 用 year / primary_release_year,tv 用 first_air_date_year
231+ switch endpoint {
232+ case "tv" :
233+ params .Set ("first_air_date_year" , year )
234+ default :
235+ params .Set ("year" , year )
236+ }
173237 }
174238
239+ searchURL := fmt .Sprintf ("%s/search/%s?%s" , s .BaseURL , endpoint , params .Encode ())
240+
175241 resp , err := s .client .Get (searchURL )
176242 if err != nil {
177243 return nil , fmt .Errorf ("TMDB搜索请求失败: %w" , err )
@@ -189,46 +255,117 @@ func (s *TMDBScraper) doTMDBSearch(query, year string) (*tmdbSearchResult, error
189255 return nil , fmt .Errorf ("TMDB搜索结果解析失败(status=%d, url=%s): %w, body=%s" ,
190256 resp .StatusCode , searchURL , err , snippet )
191257 }
258+ // /search/multi 返回的结果带 media_type,/search/movie /search/tv 不带,需要补齐
259+ if endpoint == "movie" || endpoint == "tv" {
260+ for i := range result .Results {
261+ if result .Results [i ].MediaType == "" {
262+ result .Results [i ].MediaType = endpoint
263+ }
264+ }
265+ }
192266 return & result , nil
193267}
194268
195- // searchWithFallback 带降级重试的TMDB搜索
196- // 策略:
197- // 1. 有中文标题时,先用中文标题 + 年份搜索,再用中文标题不带年份搜索
198- // 2. 有英文标题时,用英文标题 + 年份搜索,再用英文标题不带年份搜索
199- // 3. 全部搜索失败才返回错误
200- func (s * TMDBScraper ) searchWithFallback (parsed parsedVideoTitle ) (* tmdbSearchResult , error ) {
201- type searchAttempt struct {
202- query string
203- year string
// searchAttempt holds the parameters of a single TMDB search attempt.
//
// The field order is significant: attempts are built with positional
// composite literals (endpoint, query, year, language).
type searchAttempt struct {
	endpoint string // search endpoint: multi / movie / tv
	query    string // search text
	year     string // release year filter; empty means no year filter
	language string // result language, e.g. zh-CN / en-US; empty means the parameter is not sent
}
276+
277+ // buildTitleCandidates 根据原始标题构造一组候选搜索词(按优先级返回)
278+ // 候选包含:原始 -> 归一化 -> 阿拉伯数字归一化 -> 拆分子词
279+ func buildTitleCandidates (title string ) []string {
280+ if title == "" {
281+ return nil
282+ }
283+ seen := make (map [string ]bool )
284+ var out []string
285+ add := func (s string ) {
286+ s = strings .TrimSpace (s )
287+ if s == "" || seen [s ] {
288+ return
289+ }
290+ seen [s ] = true
291+ out = append (out , s )
292+ }
293+
294+ add (title )
295+ norm := normalizeTitle (title )
296+ add (norm )
297+ add (cnNumToArabic (norm ))
298+
299+ // 若中文标题里夹杂了空格分隔的多个词,把每个非短词单独作为候选
300+ for _ , w := range strings .Fields (norm ) {
301+ if utf8 .RuneCountInString (w ) >= 2 {
302+ add (w )
303+ }
204304 }
305+ return out
306+ }
205307
308+ // searchWithFallback 带降级重试的TMDB搜索
309+ // 策略(按优先级,命中即停止):
310+ // 1. 中文标题候选 × {带年份, 不带年份} × {movie, tv, multi} × language=zh-CN
311+ // 2. 英文标题候选 × {带年份, 不带年份} × {movie, tv, multi} × language=en-US
312+ // 3. 中文标题候选 × multi × 不指定 language(最后兜底)
313+ func (s * TMDBScraper ) searchWithFallback (parsed parsedVideoTitle ) (* tmdbSearchResult , error ) {
206314 var attempts []searchAttempt
207315
208- // 中文标题优先
209- if parsed .ChineseTitle != "" {
210- if parsed .Year != "" {
211- attempts = append (attempts , searchAttempt {parsed .ChineseTitle , parsed .Year })
316+ addGroup := func (title , lang string ) {
317+ if title == "" {
318+ return
319+ }
320+ cands := buildTitleCandidates (title )
321+ // 同一个候选词,先尝试 movie + 年份,再 tv + 年份,再 movie 无年份,再 tv 无年份,最后 multi
322+ for _ , q := range cands {
323+ if parsed .Year != "" {
324+ attempts = append (attempts ,
325+ searchAttempt {"movie" , q , parsed .Year , lang },
326+ searchAttempt {"tv" , q , parsed .Year , lang },
327+ )
328+ }
329+ attempts = append (attempts ,
330+ searchAttempt {"movie" , q , "" , lang },
331+ searchAttempt {"tv" , q , "" , lang },
332+ searchAttempt {"multi" , q , "" , lang },
333+ )
212334 }
213- attempts = append (attempts , searchAttempt {parsed .ChineseTitle , "" })
214335 }
215336
216- // 英文标题兜底
217- if parsed .EnglishTitle != "" {
218- if parsed .Year != "" {
219- attempts = append (attempts , searchAttempt {parsed .EnglishTitle , parsed .Year })
337+ // 中文标题优先(zh-CN)
338+ addGroup (parsed .ChineseTitle , "zh-CN" )
339+ // 英文标题兜底(en-US)
340+ addGroup (parsed .EnglishTitle , "en-US" )
341+ // 最后再用中文标题不指定语言搜一次(兜底,TMDB 多语言别名匹配可能命中)
342+ if parsed .ChineseTitle != "" {
343+ for _ , q := range buildTitleCandidates (parsed .ChineseTitle ) {
344+ attempts = append (attempts , searchAttempt {"multi" , q , "" , "" })
220345 }
221- attempts = append (attempts , searchAttempt {parsed .EnglishTitle , "" })
222346 }
223347
224348 if len (attempts ) == 0 {
225349 return nil , fmt .Errorf ("无法从文件名中提取有效标题" )
226350 }
227351
352+ // 去重,避免重复请求
353+ type key struct { ep , q , y , l string }
354+ done := make (map [key ]bool )
355+
356+ var lastErr error
228357 for _ , attempt := range attempts {
229- result , err := s .doTMDBSearch (attempt .query , attempt .year )
358+ k := key {attempt .endpoint , attempt .query , attempt .year , attempt .language }
359+ if done [k ] {
360+ continue
361+ }
362+ done [k ] = true
363+
364+ result , err := s .doTMDBSearch (attempt .endpoint , attempt .query , attempt .year , attempt .language )
230365 if err != nil {
231- return nil , err
366+ // 网络/解析错误不立即终止,记录后继续尝试下一个候选
367+ lastErr = err
368+ continue
232369 }
233370 if len (result .Results ) > 0 {
234371 return result , nil
@@ -240,6 +377,9 @@ func (s *TMDBScraper) searchWithFallback(parsed parsedVideoTitle) (*tmdbSearchRe
240377 if titleInfo == "" {
241378 titleInfo = parsed .EnglishTitle
242379 }
380+ if lastErr != nil {
381+ return nil , fmt .Errorf ("TMDB未找到匹配结果: %s (last err: %v)" , titleInfo , lastErr )
382+ }
243383 return nil , fmt .Errorf ("TMDB未找到匹配结果: %s" , titleInfo )
244384}
245385
0 commit comments